diff --git a/include/poplibs_support/Algorithm.hpp b/include/poplibs_support/Algorithm.hpp index b9368c54..4926cec6 100644 --- a/include/poplibs_support/Algorithm.hpp +++ b/include/poplibs_support/Algorithm.hpp @@ -26,6 +26,11 @@ template constexpr static auto ceildiv(T x, U y) { return (x + y - 1) / y; } +/// ceildiv with 0/0 = 0 +template static inline T ceildiv0(T q, T d) { + return q + d != 0 ? (q + d - 1) / d : 0; +} + template constexpr static auto floordiv(T x, U y) { static_assert(std::is_unsigned::value && std::is_unsigned::value, "Only valid for unsigned integral types"); @@ -79,6 +84,18 @@ constexpr static std::pair balancedPartition(T n, T d) { return std::make_pair(a, b); } +/// Compute the nth triangular number. +template static inline T nthTriangular(T n) { + static_assert(std::is_unsigned_v); + return (n * (n + 1)) / 2; +} + +/// Check if a number is a power of 2. +template static inline bool isPowerOf2(T n) { + static_assert(std::is_unsigned_v); + return ((n & (n - 1)) == 0); +} + } // end namespace poplibs_support #endif // poplibs_support_Algorithm_hpp diff --git a/include/popops/SortOrder.hpp b/include/popops/SortOrder.hpp new file mode 100644 index 00000000..8d8bed86 --- /dev/null +++ b/include/popops/SortOrder.hpp @@ -0,0 +1,24 @@ +// Copyright (c) 2021 Graphcore Ltd. All rights reserved. + +#ifndef _popops_SortOrder_hpp_ +#define _popops_SortOrder_hpp_ + +#include + +namespace popops { + +/// Defines a required order for sorting operations. +enum class SortOrder { + /// No ordering is required. + NONE, + /// Sort in ascending order. + ASCENDING, + /// Sort in descending order. + DESCENDING +}; + +std::ostream &operator<<(std::ostream &os, const SortOrder &o); + +} // end namespace popops + +#endif // _popops_SortOrder_hpp_ diff --git a/include/popops/TopK.hpp b/include/popops/TopK.hpp new file mode 100644 index 00000000..fef95cf7 --- /dev/null +++ b/include/popops/TopK.hpp @@ -0,0 +1,111 @@ +// Copyright (c) 2020 Graphcore Ltd. All rights reserved. + +#ifndef _popops_TopK_hpp_ +#define _popops_TopK_hpp_ + +#include +#include +#include + +#include + +namespace popops { + +/** Parameters for topK* APIs + */ +struct TopKParams { + /// The number of outputs from the top k operation. + /// This must be less or equal the number of elements in the innermost + /// dimension of the tensor used as input to the operation. + unsigned k; + /// If true, return the top k largest elements. Otherwise return the + /// top k smallest elements. + bool largest; + /// The required ordering of elements in the resulting tensor. + SortOrder sortOrder; + + TopKParams(unsigned k, bool largest, SortOrder sortOrder) noexcept; +}; + +std::ostream &operator<<(std::ostream &os, const TopKParams &p); + +/** Create an return a new tensor laid out optimally to be used as + * an input to a topK operation with the given parameters. + * + * \param graph The Poplar graph to add the tensor to. + * \param type The Poplar type of elements in the returned tensor. + * \param shape The shape of the returned tensor. + * \param params The parameters of the top k that the returned tensor + * will be used as input to. + * \param debugContext Optional debug information. + * + * \returns A newly created tensor with shape \p shape and full tile mapping. + */ +poplar::Tensor createTopKInput(poplar::Graph &graph, const poplar::Type &type, + const std::vector &shape, + const TopKParams ¶ms, + const poplar::DebugContext &debugContext = {}); + +/** Return the top k values in the innermost dimension of a tensor. + * + * \param graph The Poplar graph to add the operation to. + * \param prog The Poplar sequence to add the operation to. + * \param t The tensor in which to find the top-k values in + * the innermost dimension. + * \param params The parameters of the top k. + * \param debugContext Optional debug information. + * + * \returns A tensor with the top k values found in the innermost dimension + * of \p t. + */ +poplar::Tensor topK(poplar::Graph &graph, poplar::program::Sequence &prog, + const poplar::Tensor &t, const TopKParams ¶ms, + const poplar::DebugContext &debugContext = {}); + +/** Return the top k values in the innermost dimension of a tensor along + * with the permutation of another tensor with respect to the values. + * + * \param graph The Poplar graph to add the operation to. + * \param prog The Poplar sequence to add the operation to. + * \param key The tensor in which to find the top-k values in + * the innermost dimension. + * \param value A tensor with the same shape as \p key for which to + * get the permutation with respect to \p key. + * \param params The parameters of the top k. + * \param debugContext Optional debug information. + * + * \returns A pair of tensors. The first contains the top k values found + * in the innermost dimension of \p key. The second contains the + * permutation of the tensor \p value with respect to the tensor + * \p key. + */ +std::pair +topKKeyValue(poplar::Graph &graph, poplar::program::Sequence &prog, + const poplar::Tensor &keys, const poplar::Tensor &values, + const TopKParams ¶ms, + const poplar::DebugContext &debugContext = {}); + +/** Return the top k values in the innermost dimension of a tensor along + * with the indices of those values in the input tensor in the innermost + * dimension. + * + * \param graph The Poplar graph to add the operation to. + * \param prog The Poplar sequence to add the operation to. + * \param t The tensor in which to find the top-k values in + * the innermost dimension. + * \param params The parameters of the top k. + * \param debugContext Optional debug information. + * + * \returns A pair of tensors. The first contains the top k values found + * in the innermost dimension of \p t. The second contains the + * indices of those values in the innermost dimension of \p t in + * the original input. + */ +std::pair +topKWithPermutation(poplar::Graph &graph, poplar::program::Sequence &prog, + const poplar::Tensor &t, const TopKParams ¶ms, + const poplar::DebugContext &debugContext = {}); + +} // end namespace popops + +#endif // _popops_TopK_hpp_ diff --git a/lib/poplibs_test/Util.cpp b/lib/poplibs_test/Util.cpp index 0367d4fa..f77a9db1 100644 --- a/lib/poplibs_test/Util.cpp +++ b/lib/poplibs_test/Util.cpp @@ -199,6 +199,7 @@ bool checkIsClose(FPType a, FPType b, double relativeTolerance) { template bool checkIsClose(bool, bool, double); template bool checkIsClose(int, int, double); +template bool checkIsClose(unsigned, unsigned, double); template bool checkIsClose(float, float, double); template bool checkIsClose(double, double, double); diff --git a/lib/popops/BitonicTopK.cpp b/lib/popops/BitonicTopK.cpp new file mode 100644 index 00000000..8f74ba15 --- /dev/null +++ b/lib/popops/BitonicTopK.cpp @@ -0,0 +1,982 @@ +// Copyright (c) 2020 Graphcore Ltd. All rights reserved. + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +using namespace poplar; +using namespace poplar::program; +using namespace poplibs_support; +using namespace poputil; + +namespace poputil { + +template <> poplar::ProfileValue toProfileValue(const popops::SortOrder &o) { + switch (o) { + case popops::SortOrder::NONE: + return poplar::ProfileValue("NONE"); + case popops::SortOrder::ASCENDING: + return poplar::ProfileValue("ASCENDING"); + case popops::SortOrder::DESCENDING: + return poplar::ProfileValue("DESCENDING"); + default: + return poplar::ProfileValue(""); + } +} +} // namespace poputil + +namespace popops { +namespace bitonic { + +/** This returns the number of comparisons we will do at the given distance + * into a flat array with n elements. + * + * e.g. n = 15 and distance = 4 + * + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * | | | | | | | | | | | | | | | | + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * +---------------> +---------------> + * +---------------> +---------------> + * +---------------> +---------------> + * +---------------> + * + * gives the number of comparisons as 7. + */ +template +static inline T numComparisonsAtDistance(T n, T distance) { + const auto q = floordiv(n, distance * 2); + const auto r = n % (distance * 2); + const auto comparisons = q * distance + r - std::min(r, distance); + return comparisons; +} + +namespace { + +struct TensorKey { + unsigned n; + unsigned comparisons; + unsigned distance; + TensorKey(unsigned n, unsigned distance, unsigned nActive) + : n(n), comparisons(numComparisonsAtDistance(nActive, distance)), + distance(distance) {} + inline bool operator<(const TensorKey &other) const { + return std::tie(n, comparisons, distance) < + std::tie(other.n, other.comparisons, other.distance); + } + inline bool operator==(const TensorKey &other) const { + return std::tie(n, comparisons, distance) == + std::tie(other.n, other.comparisons, other.distance); + } +}; + +struct TensorKeyHash { + std::size_t operator()(const TensorKey &k) const { + std::size_t seed = 0; + boost::hash_combine(seed, k.n); + boost::hash_combine(seed, k.comparisons); + boost::hash_combine(seed, k.distance); + return seed; + } +}; + +using TensorCache = std::unordered_map; + +} // end anonymous namespace + +/** Allocates a tensor to be used when doing a compare and swap step + * with the given parameters. In particular this manages the correct + * mapping of elements to the tiles on which they will be processed + * and correct balancing of elements not processed across remaining + * tiles. + * + * This is closely tied to the compareAndSwapAtDistance function. + */ +static Tensor allocate(Graph &graph, const Type &type, std::size_t n, + std::size_t nActive, std::size_t distance, + const DebugNameAndId &dnai) { + const auto t = graph.addVariable(type, {n}, {dnai}); + const auto numTiles = graph.getTarget().getNumTiles(); + + const auto comparisons = numComparisonsAtDistance(nActive, distance); + // Allocation in each stage is done by first allocating to tiles elements + // that are 'active' i.e. take part in a comparison at the given distance. + const auto maxComparisonsPerTile = ceildiv(comparisons, numTiles); + const auto activeTiles = ceildiv0(comparisons, maxComparisonsPerTile); + + // We know from the total number of elements n what the layout will be for + // a tensor with distance equal to 1, which we use to determine how to + // spread inactive elements over tiles. + const auto maxActivePairs = floordiv(n, 2u); + const auto maxAllocatedPairsPerTile = ceildiv(maxActivePairs, numTiles); + const auto maxUsedTiles = ceildiv0(maxActivePairs, maxAllocatedPairsPerTile); + + // First allocate first comparisons * 2 elements to tiles according to + // which will process those comparisons. + for (unsigned tile = 0; tile < activeTiles; ++tile) { + const auto begin = tile * maxComparisonsPerTile; + const auto end = std::min((tile + 1) * maxComparisonsPerTile, comparisons); + graph.setTileMapping(t.slice(begin * 2, end * 2), tile); + } + // Then allocate inactive elements to tiles, balancing total number of + // elements allocated as much as possible. + auto offset = comparisons * 2; + auto inactiveRemaining = n - comparisons * 2; + for (unsigned tile = 0; tile < maxUsedTiles; ++tile) { + std::size_t activePairsThisTile = 0; + if (tile < activeTiles) { + const auto begin = tile * maxComparisonsPerTile; + const auto end = + std::min((tile + 1) * maxComparisonsPerTile, comparisons); + activePairsThisTile = (end - begin); + } + const auto inactiveThisTile = + std::min((maxAllocatedPairsPerTile - activePairsThisTile) * 2, + inactiveRemaining); + graph.setTileMapping(t.slice(offset, offset + inactiveThisTile), tile); + inactiveRemaining -= inactiveThisTile; + offset += inactiveThisTile; + } + + // If there is an odd element, we'll arbitrarily map it to the last tile. + if (offset < n) { + // We assume there is only ever 1 'odd' element + assert(n - offset == 1); + graph.setTileMapping(t.slice(offset, offset + 1), + std::max(maxUsedTiles, 1ul) - 1); + offset += 1; + } + + assert(offset == n); + + return t; +} + +/** This gives the inverse of a permutation of elements of a tensor. + */ +static std::vector +getInversePermutation(const std::vector &is) { + std::vector matchingSlices; + matchingSlices.reserve(is.size()); + std::size_t offset = 0; + for (const auto &i : is) { + matchingSlices.emplace_back(offset, offset + i.size()); + offset += i.size(); + } + + std::vector sortedIndices(is.size()); + std::iota(sortedIndices.begin(), sortedIndices.end(), 0); + std::sort(sortedIndices.begin(), sortedIndices.end(), + [&](const auto a, const auto b) { + return is[a].begin() < is[b].begin(); + }); + + std::vector inverse; + inverse.reserve(is.size()); + for (const auto &i : sortedIndices) { + inverse.push_back(matchingSlices[i]); + } + return inverse; +} + +/** Given an offset into a number of comparisons to do and the distance + * at which to compare, this gives the offset in elements for the lhs + * element taking part on that comparison in a flat array. + * + * The calculation is essentially taking the number of multiples of + * the distance d * 2 to get number of elements plus any remainder. + * + * e.g. for distance = 4 we have a flat array and the comparisons that + * will be done on that array + * + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * | | | | | | | | | | | | | | | + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * +---------------> +---------------> + * 0 +---------------> 4 +---------------> + * 1 +---------------> 5 + * 2 +---------------> + * 3 + * + * For comparison offset 3 we have no multiples of distance (4), plus + * the remainder 3, so the offset for the lhs element in that comparison + * is 3 as can be seen from the diagram. For comparison offset 5 we have + * 1 multiple of 4, multiplied by 2 and add the remainder 1 equals 9 which + * again is evident from the diagram. + */ +template +static inline T comparisonToElemOffset(T offset, T distance) { + return roundDown(offset, distance) * 2 + offset % distance; +} + +/** Given a tensor which was allocated with the given parameters, reorder + * it to give the canonical order. The canonical order is with all elements + * in the order they were given to the top-k operation. + * + * We use the canonical order as a common ordering in which to transform + * tensors between different steps of the top-k operation. + */ +static Tensor toCanonicalOrder(const Graph &graph, const Tensor &t, + std::size_t distance, std::size_t nActive) { + assert(t.rank() == 1); + + const auto n = t.dim(0); + + // Already in canonical order + if (distance == 1 && nActive == n) { + return t; + } + + const auto numTiles = graph.getTarget().getNumTiles(); + + const auto comparisons = numComparisonsAtDistance(nActive, distance); + const auto maxComparisonsPerTile = ceildiv(comparisons, numTiles); + const auto activeTiles = ceildiv0(comparisons, maxComparisonsPerTile); + + // This function just combines slices that are actually continuous. + std::vector slices; + // Just reserve something, the exact number is hard to calculate up front. + slices.reserve(activeTiles); + const auto appendSlice = [&](const auto begin, const auto end) { + if (begin == end) { + return; + } + if (!slices.empty() && slices.back().end() == begin) { + slices.back() = Interval(slices.back().begin(), end); + } else { + slices.emplace_back(begin, end); + } + }; + + // We actually build the permutation of the canonically ordered tensor's + // elements you would take to get the input tensor and then get the inverse + // permutation and apply that to the tensor to retrieve the canonical form. + for (unsigned tile = 0; tile < activeTiles; ++tile) { + auto begin = tile * maxComparisonsPerTile; + const auto end = std::min((tile + 1) * maxComparisonsPerTile, comparisons); + + if (const auto pre = std::min(distance - begin % distance, end - begin)) { + const auto elemOffset = comparisonToElemOffset(begin, distance); + appendSlice(elemOffset, elemOffset + pre); + appendSlice(elemOffset + distance, elemOffset + distance + pre); + begin += pre; + } + // Add multiples of 2 * distance chunks of elements. + if (const auto multipleOfDistanceComparisons = + roundDown(end - begin, distance)) { + appendSlice(begin * 2, (begin + multipleOfDistanceComparisons) * 2); + begin += multipleOfDistanceComparisons; + } + if (const auto post = end - begin) { + const auto elemOffset = comparisonToElemOffset(begin, distance); + appendSlice(elemOffset, elemOffset + post); + appendSlice(elemOffset + distance, elemOffset + distance + post); + begin += post; + } + assert(begin == end); + } + + // This method is tied closely with the bitonic allocate method, any inactive + // elements when comparing at a particular distance are always at the end of + // the tensor so we can just tack them on. + const auto remainingOffset = comparisonToElemOffset(comparisons, distance); + appendSlice(remainingOffset, remainingOffset + n - comparisons * 2); + + return concat(t.slices(getInversePermutation(slices))); +} + +/** Allocate a new tensor and rearrange the given input into it in preparation + * for a compare and swap step with the given parameters. + * + * Assumes src is in canonical order. + */ +static Tensor rearrangeForStep(Graph &graph, Sequence &prog, const Tensor &src, + unsigned dstDistance, unsigned dstActive, + TensorCache &tensorCache, + const DebugNameAndId &dnai) { + const auto n = src.dim(0); + const TensorKey dstKey(n, dstDistance, dstActive); + auto cacheIt = tensorCache.find(dstKey); + if (cacheIt == tensorCache.end()) { + const auto t = + allocate(graph, src.elementType(), n, dstActive, dstDistance, dnai); + cacheIt = tensorCache.emplace(dstKey, t).first; + } + const auto &dst = cacheIt->second; + const auto &dstReordered = + toCanonicalOrder(graph, dst, dstDistance, dstActive); + prog.add(Copy(src, dstReordered)); + return dst; +} + +/** WorklistBuilder is a one-time use class that is used per-tile to + * build worklists based on the available work on a tile. + */ +struct WorklistBuilder { + const unsigned numWorkers; + const unsigned maxComparisonsPerWorker; + const bool initialOrder; + const unsigned distanceToChangeOrder; + unsigned globalComparisonOffset; + + using Worklists = std::vector>; + Worklists lists; + + // The number of entries in the worklist for the current worker. + unsigned numWorkerEntries = 0; + // The number of remaining comparisons this worker can do based + // on the maximum per worker. + unsigned remainingWorkerComparisons = 0; + // The offset into the data on this tile so far. + unsigned tileOffset = 0; + + WorklistBuilder(unsigned numWorkers, unsigned maxComparisonsPerWorker, + bool initialOrder, unsigned distanceToChangeOrder, + unsigned globalComparisonOffset) + : numWorkers(numWorkers), + maxComparisonsPerWorker(maxComparisonsPerWorker), + initialOrder(initialOrder), + distanceToChangeOrder(distanceToChangeOrder), + globalComparisonOffset(globalComparisonOffset) { + lists.reserve(numWorkers); + } + + // Work is limited by and assumed to be a multiple of the given distance + // hence the parameters are the distance and a multiple, rather than a more + // flexible total number of comparisons. + void addWork(unsigned distance, unsigned repeats = 1) { + const auto numComparisons = distance * repeats; + auto remainingComparisons = numComparisons; + + unsigned comparisonOffset = 0; + // NOTE: The encoding of the worklists is described alongside the c++ + // implementation of the vertex. + while (remainingComparisons != 0) { + // Start a new worker + if (remainingWorkerComparisons == 0) { + // Fill out the number of entries for the last worker if there + // was one. + if (numWorkerEntries != 0) { + assert(!lists.empty()); + lists.back().front() = numWorkerEntries - 1; + numWorkerEntries = 0; + } + + lists.emplace_back(); + // Reserve a spot for number of entries in the worklist + lists.back().emplace_back(); + const auto currWorkOffset = + comparisonToElemOffset(comparisonOffset, distance); + lists.back().emplace_back(tileOffset + currWorkOffset); + + // Knowing the starting order of comparison over the whole + // input, we can work out for the slice of comparisons that + // this worker does what order it should start comparing + // in by using the starting offset for this worker into the + // whole input to determine how many changes of order have come before. + const bool workerInitialOrder = + initialOrder ^ + ((globalComparisonOffset / distanceToChangeOrder) & 1u); + // The worker keeps a count of comparisons left before it must + // flip the order in which it compares and swaps. Because the + // worker may start at any offset into the comparisons to be + // done over the whole input, we provide a count to initialise + // the counter with based on the offset into the whole input that + // this worker starts its work. The worker will calculate + // the remaining compare and swaps to do before flipping the order + // by taking distanceToChangeOrder - initialCount. + const auto initialCount = + globalComparisonOffset % distanceToChangeOrder; + assert(initialCount < (1u << 31)); + // We encode the initial direction and initial offset into + // 32-bits in the worklist where the upper 31 bits are allocated + // to the initial count and the lowest bit is allocated to the + // initial order. + const unsigned short initialLower = + ((initialCount & ((1u << 15u) - 1)) << 1u) | workerInitialOrder; + const unsigned short initialUpper = + (initialCount >> 15u) & ((1u << 16) - 1); + lists.back().emplace_back(initialLower); + lists.back().emplace_back(initialUpper); + + remainingWorkerComparisons = maxComparisonsPerWorker; + + // Similar to initialCount, due to the fact that a worker can + // start at any offset into the work to be done on this tile/ + // over the whole input, we provide an initial count for the + // innermost loop that would ordinarily otherwise be set to + // 'distance' for the first entry in the worklist. + const auto firstInnerCount = std::min( + distance - comparisonOffset % distance, + std::min(remainingComparisons, remainingWorkerComparisons)); + lists.back().emplace_back(firstInnerCount); + } + + const auto comparisonsThisEntry = + std::min(remainingComparisons, remainingWorkerComparisons); + + lists.back().emplace_back(distance); + lists.back().emplace_back(comparisonsThisEntry); + + comparisonOffset += comparisonsThisEntry; + globalComparisonOffset += comparisonsThisEntry; + remainingWorkerComparisons -= comparisonsThisEntry; + remainingComparisons -= comparisonsThisEntry; + ++numWorkerEntries; + } + tileOffset += numComparisons * 2; + } + + // Invalidates the builder and returns the worklists that were built. + Worklists finish() { + if (numWorkerEntries != 0) { + assert(!lists.empty()); + lists.back().front() = numWorkerEntries - 1; + numWorkerEntries = 0; + } + return std::move(lists); + } +}; + +/** The basic building block for the algorithm. This step + * performs a compare and swap at the given distance if the input + * tensors were flat array in canonical order. + * + * e.g. nActive = 15, distance = 4, distanceToChangeOrder = 4 + * + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * | | | | | | | | | | | | | | | | + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * +---------------> <---------------+ + * +---------------> <---------------+ + * +---------------> <---------------+ + * +---------------> + * + * e.g. nActive = 15, distance = 2, distanceToChangeOrder = 4 + * + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * | | | | | | | | | | | | | | | | + * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + * +-------> +-------> <-------+ <-------+ + * +-------> +-------> <-------+ + * + * In these examples each arrow indicates the 2 elements are compared, + * and if the element the arrow points to is less than the element it + * starts from, the elements are swapped. + * + * We add some limitations to this step in order to appease the + * hardware. The main one is that the distance is consistent for the + * entire operation, as is the distance at which we flip the order of + * comparison. + * + * The actual implementation maps comparisons to tiles by taking the + * flat array, and evenly splitting the work amongst tiles. Where the + * lhs and rhs of the comparison reside on different tiles, we gather + * the comparisons to be done in chunks of powers of 2. This limits + * the amount of exchange code required to rearrange for each step, + * and limits the overhead we add would add by having the vertex deal + * with different distances at which to compare to a number of different + * distances that is roughly logarithmic with respect to the number + * of elements per tile (in each compare and swap step). + */ +static std::string getVertexClass(const Tensor &keys, + const std::optional &values) { + if (values) { + return templateVertex("popops::CompareAndSwapAtDistanceKeyVal", + keys.elementType(), values->elementType()); + } else { + return templateVertex("popops::CompareAndSwapAtDistance", + keys.elementType()); + } +} +static void +compareAndSwapAtDistance(Graph &graph, Sequence &prog, const Tensor &keys, + const std::optional &values, unsigned distance, + unsigned distanceToChangeOrder, bool initialOrder, + unsigned nActive, const DebugNameAndId &dnai) { + assert(!values || keys.shape() == values->shape()); + + const auto vertexClass = getVertexClass(keys, values); + + const auto &target = graph.getTarget(); + const auto numTiles = target.getNumTiles(); + const auto numWorkers = target.getNumWorkerContexts(); + + const auto comparisons = numComparisonsAtDistance(nActive, distance); + const auto maxComparisonsPerTile = ceildiv(comparisons, numTiles); + const auto activeTiles = ceildiv0(comparisons, maxComparisonsPerTile); + + const auto cs = graph.addComputeSet({dnai, "CompareAndSwap"}); + for (unsigned tile = 0; tile < activeTiles; ++tile) { + auto begin = tile * maxComparisonsPerTile; + const auto end = std::min((tile + 1) * maxComparisonsPerTile, comparisons); + const auto comparisonsThisTile = end - begin; + + const auto v = graph.addVertex(cs, vertexClass); + graph.setTileMapping(v, tile); + + graph.connect(v["keys"], keys.slice(begin * 2, end * 2).flatten()); + if (values) { + graph.connect(v["values"], values->slice(begin * 2, end * 2).flatten()); + } + + graph.setInitialValue(v["distanceToChangeOrder"], distanceToChangeOrder); + + // We just flatten all the comparisons to be done and evenly spread + // them amongst workers for now without consideration of how the split + // might introduce uneven overheads between workers. + const auto comparisonsPerWorker = ceildiv(comparisonsThisTile, numWorkers); + WorklistBuilder builder(numWorkers, comparisonsPerWorker, initialOrder, + distanceToChangeOrder, begin); + + if (const auto pre = std::min(distance - begin % distance, end - begin)) { + builder.addWork(pre); + begin += pre; + } + if (const auto multipleOfDistanceComparisons = + floordiv(end - begin, distance)) { + builder.addWork(distance, multipleOfDistanceComparisons); + begin += multipleOfDistanceComparisons * distance; + } + if (const auto post = end - begin) { + builder.addWork(post); + begin += post; + } + assert(begin == end); + + const auto &worklists = builder.finish(); + + const auto worklistsField = v["worklists"]; + graph.setFieldSize(worklistsField, worklists.size()); + for (unsigned i = 0; i < worklists.size(); ++i) { + const auto t = + graph.addConstant(UNSIGNED_SHORT, {worklists[i].size()}, + worklists[i].data(), {dnai, "worklists"}); + graph.setTileMapping(t, tile); + graph.connect(worklistsField[i], t); + } + } + prog.add(Execute(cs)); +} + +/** Create an input for a top-k laid out in the most efficient + * way. + */ +Tensor createTopKInputImpl(Graph &graph, const Type &type, + const std::vector &shape, + const DebugNameAndId &dnai) { + if (shape.empty()) { + throw poplibs_error("shape must have at least 1 dimension"); + } + const auto numElems = product(shape); + const auto n = shape.back(); + const auto b = numElems / n; + auto t = allocate(graph, type, b * n, b * n, 1, dnai); + return t.reshape({n, b}).transpose().reshape(shape); +} + +/** Core implementation for all variants. + * + * All the variants are implemented by application of our basic + * compareAndSwapAtDistance operation some number of times with the + * right parameters. A key limitation of compareAndSwapAtDistance is that + * it requires us to compare and swap at a consistent distance across the + * entire array, flipping the order imposed at another consistent interval. + * + * ========= Sort ========= + * I will not describe a bitonic sorting network for powers of 2 as this is + * well documented. For n a power of 2 the above limitation on + * compareAndSwapAtDistance is not a problem, as the size of partitions in + * each step (the distance at which we compare and swap) is always a + * consistent factor of 2 greater than the last step across the whole n + * elements to be sorted. + * + * For n not a power of 2 we compose the sorting network of power-of-2 + * sized sorting networks. e.g. if n=100, we compose the full sort with + * smaller sorting networks of size 64, 32, 4 (64 + 32 + 4 = 100). The + * result of these power of 2 sized networks are merged and re-sorted + * hierarchically. + * + * With regards the limitation on compare and swap described above, + * the key insight is that when all the networks are of size a power of 2, + * sorting the sub-sequences into order always uses ascending powers of 2 + * distances at which to compare and swap. Once the sub-sequences are in + * order merging them to form a larger non-power of 2 sized sequence is + * always a compare and swap at the next largest power of 2. We can therefore + * apply a series of compare and swap steps with increasing powers of 2 + * distances to form our final fully sorted sequence and in each step the + * distance to compare and swap is always consistent across the whole + * array. + * + * When merging the result of our power of 2 sized sorting networks, + * we must take care. In these cases we can do one of the following: + * + * * Given an ascending/descending bitonic sequence s of size n and + * known inflection point p, we can compare and swap at distance + * p with descending order, and the result is guaranteed to be such + * that all in the set of elements left of p are greater than all + * in the set of elements right of p. + * * Conversely, given a descending/ascending bitonic sequence s of + * size n and known inflection point p, we can compare and swap at + * distance p with ascending order, and the result is guaranteed + * to be such that all in the set of elements left of p are less + * than all in the set of elements right of p. + * + * In order to use the above this means each time we merge sorted power + * of 2 sequences, they must be sorted in a particular order. Different + * powers of 2 will take different numbers of compare and swap steps to + * end up in the correct order for merging. Since the smaller powers of + * 2 must always take part in the same or fewer compare and swap operations + * than the larger powers of 2, we can carefully omit them from the + * right compare and swap operations on larger powers of 2 to ensure they + * still end up in the right order for merging and we get the correct + * result. + * + * ========= Top-K ========= + * In order to handle top-k, the general idea is to build sorted sequences + * of size k with alternating ascending/descending order, allowing us to + * merge 2 sequences of k with opposing ordering giving a result where one of + * the sequences has elements which are all greater than the elements in + * the other sequence. Each time we do this, we discard the smaller + * sequence and re-sort the sequences of k elements into ascending/ + * descending order and repeat until we are left with k elements. + * + * We make a small modification to this which is we actually first build + * sorted sequences of k' elements where k' is the next power of 2 greater + * then or equal to k. This again lets us keep our distance of compare and + * swap consistent in each step and doesn't further complicate the compare + * and swap operation. To get the final result if k is not a power of 2 + * we keep sorting and discarding descreasing sized powers of 2 until we + * are left with just k elements. + */ +std::pair topKImpl(Graph &graph, Sequence &prog, + const Tensor &t_, + const std::optional &other_, + const unsigned k, const bool largest, + const bool sorted, const bool ascendingOrder, + const DebugNameAndId &dnai) { + + const auto inputType = t_.elementType(); + if (inputType != HALF && inputType != FLOAT) { + throw poplibs_error( + "Unsupported data type for top-k " + inputType.toString() + + ". half and float are the only supported types at present."); + } + + if (other_) { + const auto otherType = other_->elementType(); + if (otherType != UNSIGNED_INT) { + throw poplibs_error("Unsupported data type for other tensor in top-k " + + otherType.toString() + + ". Only unsigned int is supported at present"); + } + if (other_->shape() != t_.shape()) { + throw poplibs_error("t.shape() (" + toString(t_.shape()) + + " != other.shape() (" + toString(other_->shape()) + + "). Other tensor in topKKeyValue must have matching " + "shape to tensor in which to find top-k."); + } + } + + if (t_.rank() == 0) { + throw poplibs_error("t must have at least one dimension"); + } + + const std::vector outputShape = [&] { + auto s = t_.shape(); + s.back() = k; + return s; + }(); + + // We use {n, batch} as our canonical internal representation for + // the shape. + auto t = + t_.rank() >= 2 ? t_.flatten(0, t_.rank() - 1) : t_.flatten().expand({0}); + t = t.transpose(); + std::optional other; + if (other_) { + other = other_->rank() >= 2 ? other_->flatten(0, other_->rank() - 1) + : other_->flatten().expand({0}); + other = other->transpose(); + } + + unsigned n = t.dim(0); + const unsigned b = t.dim(1); + + logging::popops::debug("bitonicTopK(batchSize={}, n={}, k={}, sorted={}, " + "haveOther={}, debugPath='{}')", + b, n, k, sorted, (other ? "true" : "false"), + dnai.getPathName()); + + if (k > n) { + throw poplibs_error( + "k (" + std::to_string(k) + + ") must be less than or equal to the number of input elements (" + + std::to_string(n) + ")"); + } + + const auto logK = ceilLog2(k); + const auto logN = ceilLog2(n); + // We define k' as the next power of 2 greater or equal than k. + const auto kDash = (1u << logK); + const auto stepsToSortK = nthTriangular(logK); + const auto totalSteps = nthTriangular(logN); + + logging::popops::debug( + "bitonicTopKImpl: Calculated no. of steps: total={}, k={}", totalSteps, + stepsToSortK); + + // Edge cases where the output is zero-sized. + if (b * k == 0) { + Tensor tResult = + graph.addVariable(t.elementType(), outputShape, {dnai, "keys"}); + Tensor otherResult; + if (other) { + otherResult = graph.addVariable(other->elementType(), outputShape, + {dnai, "values"}); + } + return std::make_pair(std::move(tResult), std::move(otherResult)); + } + + t = t.flatten(); + if (other) { + other = other->flatten(); + } + + // Handle some trivial cases where this is a no-op. + // + // * We already have the top/bottom k elements and the result doesn't + // need to sorted. + // * n is 1, the output is sorted by default. + if ((k == n && !sorted) || n == 1) { + // We assume the result does not alias the input and is writeable so + // ensure that is the case. + t = poputil::duplicate(graph, t, prog, {dnai, "keys"}); + t = t.reshape({k, b}).transpose().reshape(outputShape); + if (other) { + other = poputil::duplicate(graph, *other, prog, {dnai, "values"}); + other = other->reshape({k, b}).transpose().reshape(outputShape); + } + return std::make_pair(std::move(t), other.value_or(Tensor{})); + } + + // We use the float path to implement half data type for now. + const auto dataType = FLOAT; + if (inputType != dataType) { + t = cast(graph, t, dataType, prog, dnai); + } + + // Because we always discard the upper of a pair of sequences of k' + // elements, the merge order when merging sequences is based off + // leaving the top/bottom k' elements in each pair of k' sequences + // in the lower k' elements. + const bool mergeKSequencesOrder = !largest; + const bool oddKSequences = bool((n / kDash) & 1u); + + TensorCache tCache, otherCache; + for (unsigned mergeStep = 0; mergeStep < logN; ++mergeStep) { + const auto logMergeDistance = std::min(mergeStep, logK); + const auto mergeDistance = 1u << logMergeDistance; + const auto stepName = "Merge" + std::to_string(mergeStep); + + const bool mergeOrder = [&] { + bool order; + if (mergeStep >= logK) { + // Use the merge order for merging sequences of k' elements if that + // is what we are doing. + order = mergeKSequencesOrder; + } else { + // If this is the last step, we need to sort in the desired order. + if (mergeStep == logN - 1) { + order = ascendingOrder; + // If this is the last step of sorting k' sequences, make sure they + // are sorted in the order needed for merging sequences of k' + // elements. + } else if (mergeStep + 1 == logK) { + order = !mergeKSequencesOrder; + } else { + // Otherwise, we merge based on creating sorted sequences in the + // order needed either to create the final sorted sequence or to + // merge k' sequences correctly. + order = logK != logN ? (mergeKSequencesOrder ^ oddKSequences) + : !ascendingOrder; + } + } + return order; + }(); + + // As mentioned at the signature of this function, we must sometimes + // omit the parts of n composed of smaller powers of 2 in order to + // result in the correctly ordered sequence once merged with another + // power of 2. This check was sort of experimentally worked out. In + // the interests of time, I'm leaving it as it works but this should + // be justified and there may be a simpler calculation. + const auto nMod2d = n % (mergeDistance * 2); + const auto nMod4d = n % (mergeDistance * 4); + auto nThisStep = n; + if (mergeStep + 1 < logK && (nMod2d == 0 || nMod2d == nMod4d)) { + nThisStep -= nMod4d; + } + + t = rearrangeForStep(graph, prog, t, mergeDistance * b, nThisStep * b, + tCache, {dnai, "keys" + stepName}); + if (other) { + other = rearrangeForStep(graph, prog, *other, mergeDistance * b, + nThisStep * b, otherCache, + {dnai, "values" + stepName}); + } + + // While we are building sorted sequences of k' elements, the distance + // at which we change direction is the same as the merge distance, + // otherwise the direction should always be descending as we always + // discard the higher k' elements. + const auto changeDirDistance = + mergeStep < logK ? mergeDistance : 1u << (logN - 1); + compareAndSwapAtDistance(graph, prog, t, other, mergeDistance * b, + changeDirDistance * b, mergeOrder, nThisStep * b, + {dnai, stepName}); + + t = toCanonicalOrder(graph, t, mergeDistance * b, nThisStep * b); + if (other) { + other = toCanonicalOrder(graph, *other, mergeDistance * b, nThisStep * b); + } + + // If we're done building sequences of k' elements, and we still have + // more than k' elements, discard the upper k' elements of each + // pair of k' sized sequences as by this point we have imposed an + // ordering between them. + if (logMergeDistance >= logK && n > kDash) { + const auto kDashMultiples = floordiv(n, 2 * kDash); + const auto remainder = n - kDashMultiples * 2 * kDash; + const auto t2d = t.reshape({n, b}); + const auto evenPart = + t2d.slice(0, kDashMultiples * 2 * kDash) + .reshapePartial(0, 1, {kDashMultiples, 2 * kDash}); + const auto oddPart = t2d.slice(kDashMultiples * 2 * kDash, n); + t = concat(evenPart.slice(0, kDash, 1).flatten(0, 2), + oddPart.slice(0, std::min(remainder, kDash))) + .flatten(); + if (other) { + const auto t2d = other->reshape({n, b}); + const auto evenPart = + t2d.slice(0, kDashMultiples * 2 * kDash) + .reshapePartial(0, 1, {kDashMultiples, 2 * kDash}); + const auto oddPart = t2d.slice(kDashMultiples * 2 * kDash, n); + other = concat(evenPart.slice(0, kDash, 1).flatten(0, 2), + oddPart.slice(0, std::min(remainder, kDash))) + .flatten(); + } + n = nThisStep = kDashMultiples * kDash + std::min(remainder, kDash); + } + + for (unsigned sortStep = 0; sortStep < logMergeDistance; ++sortStep) { + const auto stepName = + "Sort" + std::to_string(logMergeDistance - sortStep - 1); + + const auto sortDistance = 1u << (logMergeDistance - sortStep - 1); + // Once we have built sorted sequences of k' elements, we keep + // merging and re-sorting so the distance at which we change + // directions once we start discarding is always k'/2. + const auto changeDirDistance = + std::min(mergeDistance, (1u << (logK - 1))); + + const bool sortOrder = [&] { + bool order; + if (mergeStep < logK) { + // If we are still building sorted sequences of k' elements, the + // sort order is the same as the merge order + order = mergeOrder; + } else { + if (mergeStep == logN - 1) { + // If this is the last step then sort into the final desired + // order. + order = ascendingOrder; + } else { + // Otherwise sort into the order needed to correctly merge + // sequences of k' elements. + order = !mergeKSequencesOrder; + } + } + return order; + }(); + + t = rearrangeForStep(graph, prog, t, sortDistance * b, nThisStep * b, + tCache, {dnai, "keys" + stepName}); + if (other) { + other = rearrangeForStep(graph, prog, *other, sortDistance * b, + nThisStep * b, otherCache, + {dnai, "values" + stepName}); + } + + compareAndSwapAtDistance(graph, prog, t, other, sortDistance * b, + changeDirDistance * b, sortOrder, nThisStep * b, + {dnai, stepName}); + t = toCanonicalOrder(graph, t, sortDistance * b, nThisStep * b); + if (other) { + other = + toCanonicalOrder(graph, *other, sortDistance * b, nThisStep * b); + } + + // If we have finished building sequences of k' elements and + // there are more than k elements remaining, we can discard descending + // powers of 2 during the re-sort until we are left with exactly k + // elements. + const auto lastKDashElements = ((n - 1) % kDash) + 1; + // We either discard elements from the start or end of the last k' + // elements depending on whether we want the top or bottom k elements + // and depending on the sort order and offset into the array when it + // was sorted. + const bool fromStart = + sortOrder ^ !largest ^ bool(((n - lastKDashElements) / kDash) & 1u); + const auto nToDiscard = + fromStart ? sortDistance + : ((lastKDashElements - 1) % sortDistance) + 1; + if (logMergeDistance + 1 >= logK && lastKDashElements >= nToDiscard && + lastKDashElements - nToDiscard >= k) { + std::vector slices; + if (fromStart) { + slices.emplace_back(0, n - lastKDashElements); + slices.emplace_back(n - lastKDashElements + nToDiscard, n); + } else { + slices.emplace_back(0, n - nToDiscard); + } + const auto t2d = t.reshape({n, b}); + t = concat(t2d.slices(slices)).flatten(); + if (other) { + const auto t2d = other->reshape({n, b}); + other = concat(t2d.slices(slices)).flatten(); + } + n -= nToDiscard; + nThisStep = n; + } + } + } + + if (inputType != dataType) { + t = cast(graph, t, inputType, prog, dnai); + } + + t = t.reshape({k, b}).transpose().reshape(outputShape); + if (other) { + other = other->reshape({k, b}).transpose().reshape(outputShape); + } + return std::make_pair(std::move(t), other.value_or(Tensor{})); +} + +} // end namespace bitonic +} // end namespace popops diff --git a/lib/popops/BitonicTopK.hpp b/lib/popops/BitonicTopK.hpp new file mode 100644 index 00000000..c20d9bac --- /dev/null +++ b/lib/popops/BitonicTopK.hpp @@ -0,0 +1,34 @@ +// Copyright (c) 2020 Graphcore Ltd. All rights reserved. + +#ifndef _popops_BitonicTopK_hpp_ +#define _popops_BitonicTopK_hpp_ + +#include +#include + +#include +#include +#include +#include + +namespace popops { +namespace bitonic { + +poplar::Tensor createTopKInputImpl(poplar::Graph &graph, + const poplar::Type &type, + const std::vector &shape, + const poplar::DebugNameAndId &dnai = {}); + +/// Implementation of topK using bitonic sort based method. +/// Returns a pair of top k values in t, and matching permutation +/// of \p other if it was given. +std::pair +topKImpl(poplar::Graph &graph, poplar::program::Sequence &prog, + const poplar::Tensor &t, const std::optional &other, + const unsigned k, const bool largest, const bool sorted, + const bool ascendingOrder, const poplar::DebugNameAndId &dnai = {}); + +} // end namespace bitonic +} // end namespace popops + +#endif // _popops_BitonicTopK_hpp_ diff --git a/lib/popops/CMakeLists.txt b/lib/popops/CMakeLists.txt index 9f6812ab..88c1e771 100644 --- a/lib/popops/CMakeLists.txt +++ b/lib/popops/CMakeLists.txt @@ -5,6 +5,8 @@ get_target_property(POPC_EXECUTABLE popc_bin LOCATION) add_library(popops SHARED AllTrue.cpp + BitonicTopK.cpp + BitonicTopK.hpp Cast.cpp CircBuf.cpp TensorCollectives.cpp @@ -36,7 +38,9 @@ add_library(popops SHARED SelectScalarFromRows.cpp SequenceSlice.cpp Sort.cpp + SortOrder.cpp SparseUtils.cpp + TopK.cpp UpdateScalarInRows.cpp VarianceToOrFromInvStdDev.cpp Zero.cpp @@ -61,6 +65,8 @@ add_library(popops SHARED ${CMAKE_SOURCE_DIR}/include/popops/Rearrange.hpp ${CMAKE_SOURCE_DIR}/include/popops/SequenceSlice.hpp ${CMAKE_SOURCE_DIR}/include/popops/Reduce.hpp + ${CMAKE_SOURCE_DIR}/include/popops/SortOrder.hpp + ${CMAKE_SOURCE_DIR}/include/popops/TopK.hpp ${CMAKE_SOURCE_DIR}/include/popops/Zero.hpp ${CMAKE_SOURCE_DIR}/include/poplibs_support/popopsPerformanceEstimation.hpp @@ -175,6 +181,7 @@ add_gp_library( ${CMAKE_CURRENT_SOURCE_DIR}/codelets/BroadcastVectorInnerSupervisor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codelets/CircBufIncrIndex.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codelets/CircOffset.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/codelets/CompareAndSwapAtDistance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codelets/ContinuousReduce.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codelets/DynamicSlice1d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codelets/DynamicSlice2d.cpp diff --git a/lib/popops/ElementWise.cpp b/lib/popops/ElementWise.cpp index 76ce8228..060fde02 100644 --- a/lib/popops/ElementWise.cpp +++ b/lib/popops/ElementWise.cpp @@ -420,6 +420,8 @@ unsigned maxVertexElementsPerRegion(const Target &target, const Type &outType, return 2; } else if (eType == FLOAT) { return 3; + } else if (eType == UNSIGNED_INT) { + return 4; } else { throw poplibs_error("Requested type to index conversion doesn't exist"); } @@ -428,22 +430,24 @@ unsigned maxVertexElementsPerRegion(const Target &target, const Type &outType, /* Assembler codelet implementations indicate how many elements are processed * per HW loop. If HW loop isn't in use or a codelet has only C implementation * them UINT_MAX shall be returned */ - constexpr unsigned convMap[2][NR_OF_CODELETS][4] = { + constexpr unsigned convMap[2][NR_OF_CODELETS][5] = { { // None inPlace - {0, UINT_MAX, 2, 1}, // Clamp - {0, UINT_MAX, 2, 1}, // BroadcastClamp - {UINT_MAX, 1, UINT_MAX, 1}, // Select - {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // BroadcastSelect - {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX} // BroadcastSelectorSelect + {0, UINT_MAX, 2, 1, UINT_MAX}, // Clamp + {0, UINT_MAX, 2, 1, UINT_MAX}, // BroadcastClamp + {UINT_MAX, 1, UINT_MAX, 1, UINT_MAX}, // Select + {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // BroadcastSelect + {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX, + UINT_MAX} // BroadcastSelectorSelect }, { // inPlace - {0, UINT_MAX, UINT_MAX, UINT_MAX}, // Clamp - {0, UINT_MAX, UINT_MAX, UINT_MAX}, // BroadcastClamp - {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // Select - {0, 0, 0, 0}, // BroadcastSelect - {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX} // BroadcastSelectorSelect + {0, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // Clamp + {0, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // BroadcastClamp + {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // Select + {0, 0, 0, 0, 0}, // BroadcastSelect + {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX, + UINT_MAX} // BroadcastSelectorSelect }}; unsigned inPlaceIdx = static_cast(inPlace); diff --git a/lib/popops/SortOrder.cpp b/lib/popops/SortOrder.cpp new file mode 100644 index 00000000..ffc4cd4d --- /dev/null +++ b/lib/popops/SortOrder.cpp @@ -0,0 +1,27 @@ +// Copyright (c) 2021 Graphcore Ltd. All rights reserved. + +#include +#include + +#include + +namespace popops { + +std::ostream &operator<<(std::ostream &os, const SortOrder &o) { + switch (o) { + case SortOrder::NONE: + os << "none"; + break; + case SortOrder::ASCENDING: + os << "ascending"; + break; + case SortOrder::DESCENDING: + os << "descending"; + break; + default: + throw poputil::poplibs_error("Unhandled sort order"); + } + return os; +} + +} // end namespace popops diff --git a/lib/popops/TopK.cpp b/lib/popops/TopK.cpp new file mode 100644 index 00000000..f08e23c8 --- /dev/null +++ b/lib/popops/TopK.cpp @@ -0,0 +1,126 @@ +// Copyright (c) 2020 Graphcore Ltd. All rights reserved. + +#include + +#include + +#include +#include + +#include +#include + +#include "BitonicTopK.hpp" + +using namespace poplar; +using namespace poplar::program; +using namespace poplibs_support; +using namespace poputil; + +namespace poputil { + +template <> ProfileValue toProfileValue(const popops::TopKParams &p) { + ProfileValue::Map v; + v.emplace("k", toProfileValue(p.k)); + v.emplace("largest", toProfileValue(p.largest)); + v.emplace("sortOrder", toProfileValue(p.sortOrder)); + return v; +} + +} // namespace poputil + +namespace popops { + +TopKParams::TopKParams(unsigned k, bool largest, SortOrder sortOrder) noexcept + : k(k), largest(largest), sortOrder(sortOrder) {} + +std::ostream &operator<<(std::ostream &os, const TopKParams &p) { + os << "{" + << "k=" << p.k << ", largest=" << (p.largest ? "true" : "false") + << ", sortOrder=" << p.sortOrder << "}"; + return os; +} + +Tensor createTopKInput(Graph &graph, const Type &type, + const std::vector &shape, + const TopKParams ¶ms, + const DebugContext &debugContext) { + logging::popops::info("createTopKInput(shape={}, params={}, debugPath='{}')", + shape, params, debugContext.getPathName()); + poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(shape, params), + "createTopKInput"); + const auto t = bitonic::createTopKInputImpl(graph, type, shape, {di}); + di.addOutput(t); + return t; +} + +Tensor topK(Graph &graph, Sequence &prog, const Tensor &t, + const TopKParams ¶ms, const DebugContext &debugContext) { + logging::popops::info("topK(shape={}, params={}, debugPath='{}')", t.shape(), + params, debugContext.getPathName()); + poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(t, params), "topK"); + const bool sorted = params.sortOrder != SortOrder::NONE; + const bool ascending = params.sortOrder != SortOrder::DESCENDING; + const auto result = bitonic::topKImpl(graph, prog, t, std::nullopt, params.k, + params.largest, sorted, ascending, {di}) + .first; + di.addOutput(result); + return result; +} + +std::pair topKKeyValue(Graph &graph, Sequence &prog, + const Tensor &key, const Tensor &value, + const TopKParams ¶ms, + const DebugContext &debugContext) { + logging::popops::info("topKKeyValue(shape={}, params={}, debugPath='{}')", + key.shape(), params, debugContext.getPathName()); + poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(key, value, params), + "topKKeyValue"); + const bool sorted = params.sortOrder != SortOrder::NONE; + const bool ascending = params.sortOrder != SortOrder::DESCENDING; + const auto result = + bitonic::topKImpl(graph, prog, key, value, params.k, params.largest, + sorted, ascending, {di}); + di.addOutput(result.first); + di.addOutput(result.second); + return result; +} + +std::pair +topKWithPermutation(Graph &graph, Sequence &prog, const Tensor &t, + const TopKParams ¶ms, + const DebugContext &debugContext) { + logging::popops::info( + "topKWithPermutation(shape={}, params={}, debugPath='{}')", t.shape(), + params, debugContext.getPathName()); + poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(t, params)); + const bool sorted = params.sortOrder != SortOrder::NONE; + const bool ascending = params.sortOrder != SortOrder::DESCENDING; + + const auto numElems = t.numElements(); + const auto n = t.dim(t.rank() - 1); + const auto b = numElems / t.dim(t.rank() - 1); + const auto indicesToPermute = + createTopKInput(graph, UNSIGNED_INT, t.shape(), params, {di, "indices"}); + if (b == 1) { + iota(graph, indicesToPermute.flatten(), 0u, prog, {di}); + } else { + // T34944, a 2-dimensional iota API with a second dimension in which to + // broadcast would be useful here. + std::vector singleBatchIndices(n); + std::iota(singleBatchIndices.begin(), singleBatchIndices.end(), 0); + const auto iota = + graph.addConstant(UNSIGNED_INT, {1, n}, ArrayRef(singleBatchIndices), + {di, "indicesInitializer"}); + poputil::mapTensorLinearly(graph, iota); + prog.add(Copy(iota.broadcast(b, 0).flatten(), indicesToPermute.flatten())); + } + const auto result = + bitonic::topKImpl(graph, prog, t, indicesToPermute, params.k, + params.largest, sorted, ascending, {di}); + di.addOutput(result.first); + di.addOutput(result.second); + return result; +} + +} // end namespace popops diff --git a/lib/popops/codelets/CompareAndSwapAtDistance.cpp b/lib/popops/codelets/CompareAndSwapAtDistance.cpp new file mode 100644 index 00000000..808c2ef3 --- /dev/null +++ b/lib/popops/codelets/CompareAndSwapAtDistance.cpp @@ -0,0 +1,184 @@ +// Copyright (c) 2020 Graphcore Ltd. All rights reserved. +#include +#include +#include + +#include "poplibs_support/ExternalCodelet.hpp" +#include "poplibs_support/TileConstants.hpp" + +using namespace poplar; + +static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR; +static constexpr auto DELTAN = poplar::VectorListLayout::DELTANELEMENTS; + +namespace popops { + +namespace { + +using WorklistType = unsigned short; + +} // end anonymous namespace + +template constexpr static inline T min(const T &a, const T &b) { + return a < b ? a : b; +} + +// The format of the worklists for each worker is as follows in order as they +// appear in memory. +// +// [numEntriesM1] A count for the number of entries that will follow, +// minus 1. +// the always-present info in the worklist. +// [initialOffset] An offset in groups into the indices/values that +// this worker will start at. +// [initialOrder/initialCount] 2 entries to form a 32-bit packed field. +// [0-1: initialDirection] The initial direction for this worker +// [1-32: initialCount] The initial count for the counter that tracks +// changes of direction. +// [firstInnerElemCount] Because the first entry for a worker may start +// at a weird offset, the element count for the first +// inner loop is given in the worklist. +// numEntries * { +// [distance] The distance at which to do compare and swap. +// [elemCount] The total number of elements to compare and swap +// for this entry. +// } +template +static void workerCompute(unsigned wid, Impl impl, const WorklistType *worklist, + unsigned distanceToChangeOrder) { + const auto numEntriesM1 = *worklist++; + auto numEntries = numEntriesM1 + 1; + const auto initialOffset = *worklist++; + impl.increment(initialOffset); + auto *worklistUnsigned = reinterpret_cast(worklist); + const auto packedOrderAndCount = *worklistUnsigned++; + worklist = reinterpret_cast(worklistUnsigned); + bool order = packedOrderAndCount & 1u; + const auto initialCount = packedOrderAndCount >> 1u; + auto changeOrderCounter = distanceToChangeOrder - initialCount; + + unsigned innerElemCount = *worklist++; + // numEntries should never be 0 + unsigned distance = *worklist++; + unsigned numElems = *worklist++; + + while (numEntries-- > 0) { + while (numElems != 0) { + numElems -= innerElemCount; + changeOrderCounter -= innerElemCount; + + // This will be a rpt loop. + while (innerElemCount-- > 0) { + impl.compareAndSwap(distance, order); + impl.increment(1); + } + impl.increment(distance); + innerElemCount = min(distance, numElems); + if (changeOrderCounter == 0) { + order = !order; + changeOrderCounter = distanceToChangeOrder; + } + } + // We will overread here but only by 2 worklist elements i.e. + // 4 bytes. + distance = *worklist++; + numElems = *worklist++; + innerElemCount = min(distance, numElems); + } +} + +template constexpr inline bool hasAssemblyVersion() { + return false; +} + +template struct KeyImpl { + KeyType *keys; + void compareAndSwap(unsigned distance, bool order) { + if (order == (keys[0] > keys[distance])) { + std::swap(keys[0], keys[distance]); + } + } + void increment(unsigned n) { keys += n; } +}; + +template +class CompareAndSwapAtDistance + : public SupervisorVertexIf() && + ASM_CODELETS_ENABLED> { +public: + InOut> keys; + + // Outer dimension of array indexed by worker context id + Input> worklists; + + // Used with the logical offset to determine the direction of comparison for + // a given comparison. + unsigned distanceToChangeOrder; + + IS_EXTERNAL_CODELET((hasAssemblyVersion())); + + bool compute() { + const auto numWorkers = worklists.size(); + KeyImpl impl = {&keys[0]}; + for (unsigned wid = 0; wid < numWorkers; ++wid) { + const WorklistType *worklist = &worklists[wid][0]; + workerCompute(wid, impl, worklist, distanceToChangeOrder); + } + return true; + } +}; + +template class CompareAndSwapAtDistance; + +template +constexpr inline bool hasAssemblyVersionKeyVal() { + return false; +} + +template struct KeyValImpl { + KeyType *keys; + ValueType *values; + void compareAndSwap(unsigned distance, bool order) { + if (order == (keys[0] > keys[distance])) { + std::swap(keys[0], keys[distance]); + std::swap(values[0], values[distance]); + } + } + void increment(unsigned n) { + keys += n; + values += n; + } +}; + +template +class CompareAndSwapAtDistanceKeyVal + : public SupervisorVertexIf< + hasAssemblyVersionKeyVal() && + ASM_CODELETS_ENABLED> { +public: + InOut> keys; + InOut> values; + + // Outer dimension of array indexed by worker context id + Input> worklists; + + // Used with the logical offset to determine the direction of comparison for + // a given comparison. + unsigned distanceToChangeOrder; + + IS_EXTERNAL_CODELET((hasAssemblyVersionKeyVal())); + + bool compute() { + const auto numWorkers = worklists.size(); + KeyValImpl impl = {&keys[0], &values[0]}; + for (unsigned wid = 0; wid < numWorkers; ++wid) { + const WorklistType *worklist = &worklists[wid][0]; + workerCompute(wid, impl, worklist, distanceToChangeOrder); + } + return true; + } +}; + +template class CompareAndSwapAtDistanceKeyVal; + +} // end namespace popops diff --git a/lib/popops/codelets/HeapSortVertexKV.cpp b/lib/popops/codelets/HeapSortVertexKV.cpp index d58bf8c4..327ec9ca 100644 --- a/lib/popops/codelets/HeapSortVertexKV.cpp +++ b/lib/popops/codelets/HeapSortVertexKV.cpp @@ -126,12 +126,17 @@ class HeapSortVertexKV : public poplar::Vertex { template class HeapSortVertexKV; template class HeapSortVertexKV; +template class HeapSortVertexKV; template class HeapSortVertexKV; template class HeapSortVertexKV; template class HeapSortVertexKV; template class HeapSortVertexKV; +template class HeapSortVertexKV; +template class HeapSortVertexKV; +template class HeapSortVertexKV; template class HeapSortVertexKV; template class HeapSortVertexKV; +template class HeapSortVertexKV; template class HeapSortVertexKV; } // namespace popops diff --git a/lib/popops/codelets/asm/BroadcastSelect.S b/lib/popops/codelets/asm/BroadcastSelect.S index 0a28ae97..1965bee0 100644 --- a/lib/popops/codelets/asm/BroadcastSelect.S +++ b/lib/popops/codelets/asm/BroadcastSelect.S @@ -14,17 +14,20 @@ #define BCAST_SELECT_FLOAT __runCodelet_popops__BroadcastSelect___float #define BCAST_SELECT_INT __runCodelet_popops__BroadcastSelect___int +#define BCAST_SELECT_UNSIGNED_INT __runCodelet_popops__BroadcastSelect___unsigned_int #define BCAST_SELECT_HALF __runCodelet_popops__BroadcastSelect___half #define BCAST_SELECT_BOOL __runCodelet_popops__BroadcastSelect___bool #define BCAST_SELECTOR_SELECT_HALF __runCodelet_popops__BroadcastSelectorSelect___half #define BCAST_SELECTOR_SELECT_FLOAT __runCodelet_popops__BroadcastSelectorSelect___float #define BCAST_SELECTOR_SELECT_INT __runCodelet_popops__BroadcastSelectorSelect___int +#define BCAST_SELECTOR_SELECT_UNSIGNED_INT __runCodelet_popops__BroadcastSelectorSelect___unsigned_int #define BCAST_SELECTOR_SELECT_BOOL __runCodelet_popops__BroadcastSelectorSelect___bool #define BCAST_SELECTOR_SELECT_INPLACE_HALF __runCodelet_popops__BroadcastSelectorSelectInPlace___half #define BCAST_SELECTOR_SELECT_INPLACE_FLOAT __runCodelet_popops__BroadcastSelectorSelectInPlace___float #define BCAST_SELECTOR_SELECT_INPLACE_INT __runCodelet_popops__BroadcastSelectorSelectInPlace___int +#define BCAST_SELECTOR_SELECT_INPLACE_UNSIGNED_INT __runCodelet_popops__BroadcastSelectorSelectInPlace___unsigned_int #define BCAST_SELECTOR_SELECT_INPLACE_BOOL __runCodelet_popops__BroadcastSelectorSelectInPlace___bool @@ -34,6 +37,7 @@ .endm EXPORT_FN BCAST_SELECT_INT +EXPORT_FN BCAST_SELECT_UNSIGNED_INT EXPORT_FN BCAST_SELECT_FLOAT EXPORT_FN BCAST_SELECT_HALF EXPORT_FN BCAST_SELECT_BOOL @@ -41,11 +45,13 @@ EXPORT_FN BCAST_SELECT_BOOL EXPORT_FN BCAST_SELECTOR_SELECT_HALF EXPORT_FN BCAST_SELECTOR_SELECT_FLOAT EXPORT_FN BCAST_SELECTOR_SELECT_INT +EXPORT_FN BCAST_SELECTOR_SELECT_UNSIGNED_INT EXPORT_FN BCAST_SELECTOR_SELECT_BOOL EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_HALF EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_FLOAT EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_INT +EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_UNSIGNED_INT EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_BOOL @@ -87,6 +93,7 @@ DEF_STACK_USAGE 0 .text.BCAST_SELECT_FLOAT .align 8 BCAST_SELECT_INT: +BCAST_SELECT_UNSIGNED_INT: BCAST_SELECT_FLOAT: // Load the vertex state. Bail out straight away if no rows to do ld32 $nRows, $mvertex_base, $mzero, VERTEX_OUT_ROWS_OFFSET @@ -544,6 +551,7 @@ DEF_STACK_SIZE_OWN 0 .text.BCAST_SELECTOR_SELECT_FLOAT // ---- InPlace entry points BCAST_SELECTOR_SELECT_INPLACE_FLOAT: BCAST_SELECTOR_SELECT_INPLACE_INT: +BCAST_SELECTOR_SELECT_INPLACE_UNSIGNED_INT: setzi $BYTES_PER_WORD, 4 bri 1f BCAST_SELECTOR_SELECT_INPLACE_BOOL: @@ -565,6 +573,7 @@ BCAST_SELECTOR_SELECT_INPLACE_HALF: // --- Non-inplace entry points. BCAST_SELECTOR_SELECT_FLOAT: BCAST_SELECTOR_SELECT_INT: +BCAST_SELECTOR_SELECT_UNSIGNED_INT: setzi $BYTES_PER_WORD, 4 bri 2f BCAST_SELECTOR_SELECT_BOOL: diff --git a/lib/popops/codelets/asm/select_int_float.S b/lib/popops/codelets/asm/select_int_float.S index c33384d8..485c28ee 100644 --- a/lib/popops/codelets/asm/select_int_float.S +++ b/lib/popops/codelets/asm/select_int_float.S @@ -4,11 +4,15 @@ #include "poplar/StackSizeDefs.hpp" #define SELECT_INT __runCodelet_popops__Select___int +#define SELECT_UNSIGNED_INT __runCodelet_popops__Select___unsigned_int #define SELECT_FLOAT __runCodelet_popops__Select___float .globl SELECT_INT .type SELECT_INT, @function +.globl SELECT_UNSIGNED_INT +.type SELECT_UNSIGNED_INT, @function + .globl SELECT_FLOAT .type SELECT_FLOAT, @function @@ -40,6 +44,7 @@ DEF_STACK_USAGE 0 .text.SELECT_INT nop // Required for alignment of rpt SELECT_INT: +SELECT_UNSIGNED_INT: SELECT_FLOAT: // Load the vertex state. ld32 $in1, $mvertex_base, $mzero, VERTEX_IN1_OFFSET diff --git a/lib/popops/codelets/elemwiseMiscCodelets.cpp b/lib/popops/codelets/elemwiseMiscCodelets.cpp index cc33a081..2cedeb6a 100644 --- a/lib/popops/codelets/elemwiseMiscCodelets.cpp +++ b/lib/popops/codelets/elemwiseMiscCodelets.cpp @@ -459,6 +459,7 @@ template class Select : public Vertex { template class Select; template class Select; template class Select; +template class Select; template class Select; template class BroadcastClamp : public Vertex { @@ -517,6 +518,7 @@ template class BroadcastSelect : public Vertex { template class BroadcastSelect; template class BroadcastSelect; template class BroadcastSelect; +template class BroadcastSelect; template class BroadcastSelect; // 'Select' ternary operator where the selector (boolean third operand) is a @@ -544,6 +546,7 @@ template class BroadcastSelectorSelect : public Vertex { template class BroadcastSelectorSelect; template class BroadcastSelectorSelect; template class BroadcastSelectorSelect; +template class BroadcastSelectorSelect; template class BroadcastSelectorSelect; template class ClampInPlace : public Vertex { @@ -615,6 +618,7 @@ template class SelectInPlace : public Vertex { template class SelectInPlace; template class SelectInPlace; template class SelectInPlace; +template class SelectInPlace; template class SelectInPlace; template @@ -640,6 +644,7 @@ class BroadcastSelectorSelectInPlace : public Vertex { template class BroadcastSelectorSelectInPlace; template class BroadcastSelectorSelectInPlace; template class BroadcastSelectorSelectInPlace; +template class BroadcastSelectorSelectInPlace; template class BroadcastSelectorSelectInPlace; template class Histogram2D : public Vertex { diff --git a/lib/popops/popopsCycleEstimators.cpp b/lib/popops/popopsCycleEstimators.cpp index 4e886ec1..2a526b0a 100644 --- a/lib/popops/popopsCycleEstimators.cpp +++ b/lib/popops/popopsCycleEstimators.cpp @@ -2821,6 +2821,20 @@ VertexPerfEstimate MAKE_PERF_ESTIMATOR_NAME(TransposeSupervisor)( return 7 + 6 * maxCycles; } +VertexPerfEstimate MAKE_PERF_ESTIMATOR_NAME(CompareAndSwapAtDistance)( + const VertexIntrospector &vertex, const Target &target, + const Type &keyType) { + // TODO: + return 0; +} + +VertexPerfEstimate MAKE_PERF_ESTIMATOR_NAME(CompareAndSwapAtDistanceKeyVal)( + const VertexIntrospector &vertex, const Target &target, const Type &keyType, + const Type &valueType) { + // TODO: + return 0; +} + #define BROADCAST_2TYPE_CYCLE_ESTIM_ENTRIES(vertexName) \ CYCLE_ESTIMATOR_ENTRY(popops, vertexName, \ BinaryOpType::VARIANCE_TO_INV_STD_DEV, FLOAT, HALF), \ @@ -3106,26 +3120,32 @@ poplibs::PerfEstimatorTable makePerfFunctionTable() { CYCLE_ESTIMATOR_ENTRY(popops, Select, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, Select, HALF), CYCLE_ESTIMATOR_ENTRY(popops, Select, INT), + CYCLE_ESTIMATOR_ENTRY(popops, Select, UNSIGNED_INT), CYCLE_ESTIMATOR_ENTRY(popops, Select, BOOL), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, HALF), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, INT), + CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, UNSIGNED_INT), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, BOOL), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, HALF), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, INT), + CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, UNSIGNED_INT), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, BOOL), CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, HALF), CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, INT), + CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, UNSIGNED_INT), CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, BOOL), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, HALF), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, INT), + CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, + UNSIGNED_INT), CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, BOOL), CYCLE_ESTIMATOR_ENTRY(popops, Histogram2D, FLOAT, true), @@ -3193,9 +3213,11 @@ poplibs::PerfEstimatorTable makePerfFunctionTable() { CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, INT, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, INT, HALF), CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, INT), + CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, UNSIGNED_INT), CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, HALF), CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, INT), + CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, UNSIGNED_INT), CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, FLOAT), CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, HALF), @@ -3236,7 +3258,10 @@ poplibs::PerfEstimatorTable makePerfFunctionTable() { CYCLE_ESTIMATOR_ENTRY(popops, TransposeSupervisor, HALF), CYCLE_ESTIMATOR_ENTRY(popops, TransposeSupervisor, UNSIGNED_SHORT), CYCLE_ESTIMATOR_ENTRY(popops, TransposeSupervisor, SHORT), - }; + + CYCLE_ESTIMATOR_ENTRY(popops, CompareAndSwapAtDistance, FLOAT), + CYCLE_ESTIMATOR_ENTRY(popops, CompareAndSwapAtDistanceKeyVal, FLOAT, + UNSIGNED_INT)}; for (const auto &entry : unaryOpPerfInfo) { table.push_back(CYCLE_ESTIMATOR_ENTRY(popops, UnaryOp2D, entry.first.first, diff --git a/tests/popops/CMakeLists.txt b/tests/popops/CMakeLists.txt index 3b74ffb3..5f7c02ba 100644 --- a/tests/popops/CMakeLists.txt +++ b/tests/popops/CMakeLists.txt @@ -720,9 +720,14 @@ add_std_operators_test(SelectFloatRHSConst) add_std_operators_test(SelectFloatLHSAndRHSConst) add_std_operators_test(SelectHalfLHSAndRHSConst) add_std_operators_test(SelectInt) +add_std_operators_test(SelectUInt) +add_std_operators_test(SelectInPlaceInt) +add_std_operators_test(SelectInPlaceUInt) add_std_operators_test(BroadcastSelectorSelectInt) +add_std_operators_test(BroadcastSelectorSelectUInt) add_std_operators_test(BroadcastSelectorSelectFloat) add_std_operators_test(BroadcastSelectorSelectInPlaceInt) +add_std_operators_test(BroadcastSelectorSelectInPlaceUInt) add_std_operators_test(BroadcastSelectorSelectInPlaceFloat) add_std_operators_test(ClampFloat) add_std_operators_test(ClampFloatMinConst) @@ -1098,90 +1103,227 @@ foreach(dType half float) endforeach() endforeach() -add_multitarget_test(NAME topk_f32_u32_n1000_k1000_sorted - COMMAND topk - --n 1000 - --k 1000 - --data-type float - --index-type uint - --sort-output=1 - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f32_u32_n1000_k100_sorted - COMMAND topk - --n 1000 - --k 100 - --data-type float - --index-type uint - --sort-output=1 - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f32_u32_n1000_k1 - COMMAND topk - --n 1000 - --k 100 - --data-type float - --index-type uint - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f32_u32_n1000_k1000_unsorted - COMMAND topk - --n 1000 - --k 1000 - --data-type float - --index-type uint - --sort-output=0 - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f32_u32_n1000_k100_unsorted - COMMAND topk - --n 1000 - --k 100 - --data-type float - --index-type uint - --sort-output=0 - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f16_u32_n1000_k1000_sorted - COMMAND topk - --n 1000 - --k 1000 - --data-type half - --index-type uint - --sort-output=1 - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f16_u32_n1000_k100_sorted - COMMAND topk - --n 1000 - --k 100 - --data-type half - --index-type uint - --sort-output=1 - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f16_u32_n1000_k1 - COMMAND topk - --n 1000 - --k 100 - --data-type half - --index-type uint - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f16_u32_n1000_k1000_unsorted - COMMAND topk - --n 1000 - --k 1000 - --data-type half - --index-type uint - --sort-output=0 - --tiles-per-ipu=16) - -add_multitarget_test(NAME topk_f16_u32_n1000_k100_unsorted - COMMAND topk - --n 1000 - --k 100 - --data-type half - --index-type uint - --sort-output=0 - --tiles-per-ipu=16) + +foreach(DATA_TYPE half float) + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_b0_n1000_k100_edge_case + COMMAND topk + --n 1000 + --k 100 + --b 0 + --data-type ${DATA_TYPE} + --index-type uint + --api=popops + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + foreach(BATCH_SIZE 1 3) + add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k1000_sorted + COMMAND topk + --n 1000 + --k 1000 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --sort-order=descending + --api=popnn + --tiles-per-ipu=4) + + add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k100_sorted + COMMAND topk + --n 1000 + --k 100 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --sort-order=descending + --api=popnn + --tiles-per-ipu=4) + + add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k1 + COMMAND topk + --n 1000 + --k 100 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --api=popnn + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k1000_unsorted + COMMAND topk + --n 1000 + --k 1000 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --sort-order=none + --api=popnn + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k100_unsorted + COMMAND topk + --n 1000 + --k 100 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --sort-order=none + --api=popnn + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + endforeach() + + foreach(INDICES 0 1) + foreach(BATCH_SIZE 1 3) + foreach(SORT_ORDER ascending descending) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1_k1_${SORT_ORDER}_edge_case + COMMAND topk + --n 1 + --k 1 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --api=popops + --sort-order=${SORT_ORDER} + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k1000_${SORT_ORDER} + COMMAND topk + --n 1000 + --k 1000 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --sort-order=${SORT_ORDER} + --api=popops + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k1234_${SORT_ORDER} + COMMAND topk + --n 1234 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --api=popops + --sort-order=${SORT_ORDER} + --tiles-per-ipu=4) + + foreach(LARGEST 0 1) + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k100_largest${LARGEST}_${SORT_ORDER} + COMMAND topk + --n 1000 + --k 100 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --largest ${LARGEST} + --sort-order=${SORT_ORDER} + --api=popops + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k123_largest${LARGEST}_${SORT_ORDER} + COMMAND topk + --n 1234 + --k 123 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --api=popops + --largest ${LARGEST} + --sort-order=${SORT_ORDER} + --tiles-per-ipu=4) + endforeach() + endforeach() + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k0_edge_case + COMMAND topk + --n 1000 + --k 0 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --api=popops + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k1000_unsorted + COMMAND topk + --n 1000 + --k 1000 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --sort-order=none + --api=popops + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k1234_unsorted + COMMAND topk + --n 1234 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --api=popops + --sort-order=none + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + foreach(LARGEST 0 1) + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k100_largest${LARGEST}_unsorted + COMMAND topk + --n 1000 + --k 100 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --largest ${LARGEST} + --sort-order=none + --api=popops + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k1_largest${LARGEST} + COMMAND topk + --n 1000 + --k 100 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --largest ${LARGEST} + --api=popops + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + + add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k123_largest${LARGEST}_unsorted + COMMAND topk + --n 1234 + --k 123 + --batch-size ${BATCH_SIZE} + --data-type ${DATA_TYPE} + --index-type uint + --return-indices ${INDICES} + --api=popops + --largest ${LARGEST} + --sort-order=none + --tiles-per-ipu=4 + VARIANTS ${IPUMODEL_VARIANTS}) + endforeach() + endforeach() + endforeach() +endforeach() diff --git a/tests/popops/SortTest.cpp b/tests/popops/SortTest.cpp index f7bc5b5b..0e3f965f 100644 --- a/tests/popops/SortTest.cpp +++ b/tests/popops/SortTest.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -403,3 +404,104 @@ BOOST_AUTO_TEST_CASE(DeviceSortKVInt) { BOOST_CHECK(std::is_sorted(begin, end)); } } + +BOOST_AUTO_TEST_CASE(DeviceSortKVFloatUInt) { + std::array key; + std::array value; + boost::random::mt19937 gen; + boost::random::uniform_real_distribution<> dist(-1024, 1024); + std::generate(std::begin(key), std::end(key), std::bind(dist, gen)); + std::iota(value.begin(), value.end(), 0); + auto out = deviceSortKV(key, value); + + // Check that we have the same elements in some order + BOOST_CHECK( + std::is_permutation(std::begin(value), std::end(value), std::begin(out))); + + // Check that the elements are in sorted order + std::array keyPermuted; + for (std::size_t i = 0; i < out.size(); ++i) { + keyPermuted[i] = key[out[i]]; + } + BOOST_CHECK(std::is_sorted(std::begin(keyPermuted), std::begin(keyPermuted))); + + out = deviceSortKV(key, value, {4, 4, 4}, 2); + BOOST_CHECK( + std::is_permutation(std::begin(value), std::end(value), std::begin(out))); + + // Check that the elements are in sorted order on the specified dimension + for (std::size_t i = 0; i < out.size(); ++i) { + keyPermuted[i] = key[out[i]]; + } + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + const auto begin = keyPermuted.data() + (i * 16 + j * 4); + const auto end = keyPermuted.data() + (i * 16 + (j + 1) * 4); + + BOOST_CHECK(std::is_sorted(begin, end)); + } + } + + out = deviceSortKV(key, value, {4, 4, 4}, 1); + BOOST_CHECK( + std::is_permutation(std::begin(value), std::end(value), std::begin(out))); + + // Check that the elements are in sorted order on the specified dimension + for (std::size_t i = 0; i < out.size(); ++i) { + keyPermuted[i] = key[out[i]]; + } + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + BOOST_CHECK(keyPermuted[i * 16 + j * 4 + k] <= + keyPermuted[i * 16 + (j + 1) * 4 + k]); + } + } + } + + out = deviceSortKV(key, value, {4, 4, 4}, 0); + BOOST_CHECK( + std::is_permutation(std::begin(value), std::end(value), std::begin(out))); + + // Check that the elements are in sorted order on the specified dimension + for (std::size_t i = 0; i < out.size(); ++i) { + keyPermuted[i] = key[out[i]]; + } + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 4; ++k) { + BOOST_CHECK(keyPermuted[i * 16 + j * 4 + k] <= + keyPermuted[(i + 1) * 16 + j * 4 + k]); + } + } + } + + out = deviceSortKV(key, value, {16, 4}, 0); + BOOST_CHECK( + std::is_permutation(std::begin(value), std::end(value), std::begin(out))); + + // Check that the elements are in sorted order on the specified dimension + for (std::size_t i = 0; i < out.size(); ++i) { + keyPermuted[i] = key[out[i]]; + } + for (int i = 0; i < 15; ++i) { + for (int j = 0; j < 4; ++j) { + BOOST_CHECK(keyPermuted[i * 4 + j] <= keyPermuted[(i + 1) * 4 + j]); + } + } + + out = deviceSortKV(key, value, {16, 4}, 1); + BOOST_CHECK( + std::is_permutation(std::begin(value), std::end(value), std::begin(out))); + + // Check that the elements are in sorted order on the specified dimension + for (std::size_t i = 0; i < out.size(); ++i) { + keyPermuted[i] = key[out[i]]; + } + for (int i = 0; i < 15; ++i) { + const auto begin = keyPermuted.data() + (i * 4); + const auto end = keyPermuted.data() + ((i + 1) * 4); + + BOOST_CHECK(std::is_sorted(begin, end)); + } +} diff --git a/tests/popops/StdOperatorsTest.cpp b/tests/popops/StdOperatorsTest.cpp index 15555799..3e1d91bd 100644 --- a/tests/popops/StdOperatorsTest.cpp +++ b/tests/popops/StdOperatorsTest.cpp @@ -186,6 +186,26 @@ static void setBinaryOpInputs(int hIn1[DIM_SIZE][DIM_SIZE], } } +static void setBinaryOpInputs(unsigned hIn1[DIM_SIZE][DIM_SIZE], + unsigned hIn2[DIM_SIZE][DIM_SIZE], + bool isShift = false) { + int val1 = 25; + int val2 = 59; + for (auto r = 0U; r != DIM_SIZE; ++r) { + for (auto c = 0U; c != DIM_SIZE; ++c) { + hIn1[r][c] = (1 - 2 * (r & 1)) * (r + val1); + hIn2[r][c] = (1 - 2 * ((r + c) & 1)) * (r + c + val2); + + // Shifting by more than 32 is undefined. + if (isShift) { + hIn2[r][c] = hIn2[r][c] % 32; + if (hIn2[r][c] < 0) + hIn2[r][c] = -hIn2[r][c]; + } + } + } +} + template void convertToPositive(T array[DIM_SIZE][DIM_SIZE]) { for (auto i = 0U; i < DIM_SIZE; ++i) { for (auto j = 0U; j < DIM_SIZE; ++j) { @@ -399,7 +419,7 @@ void powTest() { } } -template void selectTest() { +template void selectTest(bool inPlace = false) { auto type = equivalent_device_type().value; auto device = createTestDevice(deviceType); Graph graph(device.getTarget()); @@ -420,7 +440,12 @@ template void selectTest() { Tensor in3 = mapUnaryOpTensor(graph, BOOL); auto prog = Sequence(); - auto out = select(graph, in1, in2, in3, prog); + Tensor out = in1; + if (!inPlace) { + out = select(graph, in1, in2, in3, prog); + } else { + selectInPlace(graph, in1, in2, in3, prog); + } graph.createHostWrite("in1", in1); graph.createHostWrite("in2", in2); graph.createHostWrite("in3", in3); @@ -2087,13 +2112,23 @@ int main(int argc, char **argv) { } else if (test == "SelectHalfLHSAndRHSConst") { selectTestHalfLHSAndRHSConst(); } else if (test == "SelectInt") { - selectTest(); + selectTest(false); + } else if (test == "SelectUInt") { + selectTest(false); + } else if (test == "SelectInPlaceInt") { + selectTest(true); + } else if (test == "SelectInPlaceUInt") { + selectTest(true); } else if (test == "BroadcastSelectorSelectInt") { broadcastSelectorSelectTest(false); + } else if (test == "BroadcastSelectorSelectUInt") { + broadcastSelectorSelectTest(false); } else if (test == "BroadcastSelectorSelectFloat") { broadcastSelectorSelectTest(false); } else if (test == "BroadcastSelectorSelectInPlaceInt") { broadcastSelectorSelectTest(true); + } else if (test == "BroadcastSelectorSelectInPlaceUInt") { + broadcastSelectorSelectTest(true); } else if (test == "BroadcastSelectorSelectInPlaceFloat") { broadcastSelectorSelectTest(true); } else if (test == "ClampFloat") { diff --git a/tests/popops/codelets/BroadcastSelect.cpp.in b/tests/popops/codelets/BroadcastSelect.cpp.in index 1a8784ad..9881e463 100644 --- a/tests/popops/codelets/BroadcastSelect.cpp.in +++ b/tests/popops/codelets/BroadcastSelect.cpp.in @@ -1,3 +1,5 @@ +// Copyright (c) 2021 Graphcore Ltd. All rights reserved. + // Tests for 'BroadcastSelect' and 'BroadcastSelectorSelect[InPlace]' codelets. // This file will create 3 boost test cases (one for each codelet). // @@ -38,6 +40,8 @@ typedef float TEST_TYPE; #elif defined(DATA_TYPE_bool) typedef unsigned char TEST_TYPE; +#elif defined(DATA_TYPE_unsigned_int) +typedef unsigned TEST_TYPE; #else typedef @DATA_TYPE@ TEST_TYPE; #endif @@ -211,7 +215,7 @@ BOOST_AUTO_TEST_CASE(BROADCAST_SELECT_TYPE) { BroadcastSelectTest(false, false, true); BroadcastSelectTest(true, true, true); BroadcastSelectTest(false, false, true); -#elif defined(DATA_TYPE_int) +#elif defined(DATA_TYPE_int) || defined(DATA_TYPE_unsigned_int) BroadcastSelectTest(0xcafebabe, 0xdeadbeef, 0x12345678); #else BroadcastSelectTest(31.4159, 27.1828, 666.0); diff --git a/tests/popops/codelets/CMakeLists.txt b/tests/popops/codelets/CMakeLists.txt index 2045651a..9c056b69 100644 --- a/tests/popops/codelets/CMakeLists.txt +++ b/tests/popops/codelets/CMakeLists.txt @@ -384,7 +384,7 @@ foreach(TYPE half float) endforeach() # Select -foreach(DATA_TYPE int float half bool) +foreach(DATA_TYPE int unsigned_int float half bool) set(VARIANT_NAME "Select_${DATA_TYPE}") set(VARIANT_SRC "${CMAKE_CURRENT_BINARY_DIR}/${VARIANT_NAME}.cpp") string(TOUPPER "${DATA_TYPE}" TYPE) @@ -395,7 +395,7 @@ foreach(DATA_TYPE int float half bool) endforeach() # BroadcastSelect -foreach(DATA_TYPE int float half bool) +foreach(DATA_TYPE int unsigned_int float half bool) set(VARIANT_NAME "BroadcastSelect_${DATA_TYPE}") set(VARIANT_SRC "${CMAKE_CURRENT_BINARY_DIR}/${VARIANT_NAME}.cpp") string(TOUPPER "${DATA_TYPE}" TYPE) diff --git a/tests/popops/codelets/select/unsigned_int.hpp b/tests/popops/codelets/select/unsigned_int.hpp new file mode 100644 index 00000000..3438dcd2 --- /dev/null +++ b/tests/popops/codelets/select/unsigned_int.hpp @@ -0,0 +1,25 @@ +// Copyright (c) 2018 Graphcore Ltd. All rights reserved. +const std::vector> in1 = {{}, + {11}, + {31, 33}, + {51, 53, 55}, + {71, 73, 75, 77}, + {91, 93, 95, 97, 99}, + {1, 3, 5, 7, 9, 21, 23, 25, 27}}; + +const std::vector> in2 = {{}, + {20}, + {40, 42}, + {60, 62, 64}, + {80, 82, 84, 86}, + {100, 102, 104, 106, 108}, + {2, 4, 6, 8, 10, 22, 24, 26, 28}}; + +const std::vector> expected = { + {}, + {11}, + {40, 33}, + {60, 53, 64}, + {71, 82, 75, 86}, + {91, 93, 104, 106, 99}, + {2, 4, 6, 7, 9, 21, 24, 25, 28}}; diff --git a/tools/topk.cpp b/tools/topk.cpp index e6744fee..b598b12d 100644 --- a/tools/topk.cpp +++ b/tools/topk.cpp @@ -2,14 +2,20 @@ #include +#include #include #include #include #include +#include #include #include #include +#include +#include +#include #include +#include #include #include @@ -18,12 +24,65 @@ using namespace poplar; using namespace poplar::program; using namespace poplibs_support; using namespace poplibs_test::util; +using namespace poputil; #define FLOAT_REL_TOL 1e-6 #define HALF_REL_TOL 1e-5 #define FLOAT_ABS_TOL 1e-6 #define HALF_ABS_TOL 1e-5 +enum class API { + Popops, + Popnn, + PopopsSort, +}; + +inline std::ostream &operator<<(std::ostream &os, const API &api) { + switch (api) { + case API::Popops: + os << "popops"; + break; + case API::Popnn: + os << "popnn"; + break; + case API::PopopsSort: + os << "popops-sort"; + default: + throw poplibs_error("Unhandled API type"); + } + return os; +} + +inline std::istream &operator>>(std::istream &is, API &api) { + std::string token; + is >> token; + if (token == "popops") { + api = API::Popops; + } else if (token == "popnn") { + api = API::Popnn; + } else if (token == "popops-sort") { + api = API::PopopsSort; + } else { + throw poplibs_error("Unknown API type '" + token + "'"); + } + return is; +} + +inline std::istream &operator>>(std::istream &is, popops::SortOrder &o) { + std::string token; + is >> token; + if (token == "none") { + o = popops::SortOrder::NONE; + } else if (token == "ascending") { + o = popops::SortOrder::ASCENDING; + } else if (token == "descending") { + o = popops::SortOrder::DESCENDING; + } else { + throw poplibs_error("Unknown sort order '" + token + "'"); + } + return is; +} + int main(int argc, char **argv) try { namespace po = boost::program_options; @@ -33,7 +92,11 @@ int main(int argc, char **argv) try { boost::optional tilesPerIPU; Type dataType = FLOAT; Type indexType = UNSIGNED_INT; - bool sortOutput = true; + bool largest = true; + popops::SortOrder sortOrder = popops::SortOrder::ASCENDING; + API api = API::Popops; + bool returnIndices = true; + bool returnValues = true; po::options_description desc("Options"); // clang-format off @@ -52,6 +115,8 @@ int main(int argc, char **argv) try { ("ignore-data", "Don't upload/download to/from the device and consequently don't " "validate results") + ("report-total-cycles", + "Print total cycle count for the whole operation") ("n", po::value(&n)->required(), "Number of input elements") @@ -68,9 +133,23 @@ int main(int argc, char **argv) try { ("index-type", po::value(&indexType)->default_value(indexType), "The type of the indices") - ("sort-output", - po::value(&sortOutput)->default_value(sortOutput), - "Ensure the output of the top-k is sorted") + ("largest", + po::value(&largest)->default_value(largest), + "If true return the top k largest elements, otherwise return top k smallest elements") + ("sort-order", + po::value(&sortOrder)->default_value(sortOrder), + "Sort order of the output of the top-k") + ("return-indices", + po::value(&returnIndices)->default_value(returnIndices), + "Use API returning indices of top k values") + ("return-values", + po::value(&returnValues)->default_value(returnValues), + "Use API returning top k values") + ("api", + po::value(&api)->default_value(api), + "Which API to use (popops | popnn)") + ("random-seed", + "Use a random seed") ; // clang-format on @@ -90,24 +169,82 @@ int main(int argc, char **argv) try { } bool profile = vm.count("profile"); - bool ignoreData = vm.count("ignore-data"); bool showVarStorage = vm.count("show-var-storage"); + bool reportTotalCycles = vm.count("report-total-cycles"); + bool randomSeed = vm.count("random-seed"); + + if (profile && reportTotalCycles) { + std::cerr + << "Can't report total cycles and profile at the same time as " + "profiling instrumentation would skew total cycles measurement\n"; + return 1; + } // If k was not explicitly provided, set it equal to n if (!vm.count("k")) { k = n; } - std::cout << "Top-K with batch-size " << batchSize << ", input size " << n - << ", and output size " << k << "\n"; + if (!returnIndices && !returnValues) { + std::cerr + << "At least one of return-indices and return-values must be true\n"; + return 1; + } + + switch (api) { + case API::Popops: + // Nothing. Popops API supports all arguments. + break; + case API::Popnn: + if (!returnIndices) { + std::cerr << "Warning: popnn API only supports returning both indices " + "and values. Forcing return-indices on\n"; + returnIndices = true; + } + if (!returnValues) { + std::cerr << "Warning: popnn API only supports returning both indices " + "and values. Forcing return-values on\n"; + returnValues = true; + } + if (sortOrder == popops::SortOrder::ASCENDING) { + std::cerr << "Warning: popnn API only supports returning values sorted " + "in descending order. Forcing sort-order to Descending\n"; + sortOrder = popops::SortOrder::DESCENDING; + } + if (!largest) { + std::cerr << "Warning: popnn API only supports returning top k largest " + "values. Forcing largest to true\n"; + largest = true; + } + break; + case API::PopopsSort: + if (n != k) { + std::cerr << "Warning: popops sort API only supports full sort. Forcing " + "k equal to n\n"; + k = n; + } + if (returnIndices && returnValues) { + std::cerr << "Warning: popops sort API only supports returning either " + "keys or values not both. Forcing returnValues to false\n"; + returnValues = false; + } + break; + } constexpr bool alwaysCompileCode = true; auto device = createTestDevice(deviceType, 1, tilesPerIPU, alwaysCompileCode); const auto &target = device.getTarget(); Graph graph(target); - popnn::addCodelets(graph); + if (api == API::Popnn) { + popnn::addCodelets(graph); + } else if (api == API::Popops || api == API::PopopsSort) { + popops::addCodelets(graph); + } - const auto in = graph.addVariable(dataType, {batchSize, n}, "in"); + const std::vector inShape = {batchSize, n}; + const std::vector outShape = {batchSize, k}; + + const auto in = graph.addVariable(dataType, inShape, "in"); // TODO: Eventually we should have an allocation function for the inputs // which will probably just map linearly with some kind of grain size. poputil::mapTensorLinearly(graph, in); @@ -116,15 +253,69 @@ int main(int argc, char **argv) try { Sequence prog, uploadProg, downloadProg; Tensor outIndices; - Tensor outValues = - popnn::topK(graph, in, outIndices, k, sortOutput, prog, "top-k"); + Tensor outValues; + if (api == API::Popnn) { + const bool sorted = sortOrder != popops::SortOrder::NONE; + outValues = popnn::topK(graph, in, outIndices, k, sorted, prog, "top-k"); + // Weirdly this interface seems to return the partials dimension as part of + // the result, even though this is always 1? + outValues = outValues.squeeze({1}); + outIndices = outIndices.squeeze({1}); + } else if (api == API::Popops) { + const popops::TopKParams params(k, largest, sortOrder); + if (returnIndices) { + std::tie(outValues, outIndices) = + popops::topKWithPermutation(graph, prog, in, params, "top-k"); + } else { + outValues = popops::topK(graph, prog, in, params, "top-k"); + } + } else if (api == API::PopopsSort) { + if (returnValues) { + outValues = popops::sort(graph, in, 1, prog, "top-k"); + } else { + std::vector batchIndices(n); + std::iota(batchIndices.begin(), batchIndices.end(), 0); + const auto iota = graph.addConstant( + indexType, {1, n}, ArrayRef(batchIndices), "indicesInitializer"); + poputil::mapTensorLinearly(graph, iota); + auto indices = iota.broadcast(batchSize, 0); + outIndices = popops::sortKeyValue(graph, in, indices, 1, prog, "top-k"); + } + } + + // Check types/shapes returned by the API + if (returnIndices) { + if (outIndices.elementType() != indexType) { + std::cerr << "Actual index type (" << outIndices.elementType() + << ") is not the requested index type ( " << indexType << ")\n"; + return 1; + } + if (outIndices.shape() != outShape) { + std::cerr << "Shape of returned indices (" << outIndices.shape() + << ") does not match the expected shape (" << outShape << ")\n"; + return 1; + } + } + if (returnValues) { + if (outValues.elementType() != dataType) { + std::cerr << "Actual value type (" << outIndices.elementType() + << ") is not the requested value type ( " << dataType << ")\n"; + } + if (outValues.shape() != outShape) { + std::cerr << "Shape of returned values (" << outValues.shape() + << ") does not match the expected shape (" << outShape << ")\n"; + return 1; + } + } std::unique_ptr rawHostIn, rawHostIndicesOut, rawHostValuesOut; - if (!ignoreData) { - rawHostIn = allocateHostMemoryForTensor(in, "in", graph, uploadProg, - downloadProg, tmap); + rawHostIn = allocateHostMemoryForTensor(in, "in", graph, uploadProg, + downloadProg, tmap); + if (returnIndices) { rawHostIndicesOut = allocateHostMemoryForTensor( outIndices, "outIndices", graph, uploadProg, downloadProg, tmap); + } + if (returnValues) { rawHostValuesOut = allocateHostMemoryForTensor( outValues, "outValues", graph, uploadProg, downloadProg, tmap); } @@ -134,6 +325,12 @@ int main(int argc, char **argv) try { engineOptions.set("debug.instrument", "true"); } + Tensor cycleCounter; + if (reportTotalCycles) { + cycleCounter = cycleCount(graph, prog, 0, "measure-total-cycles"); + graph.createHostRead("totalCycleCount", cycleCounter); + } + Engine engine(graph, Sequence(uploadProg, prog, downloadProg), engineOptions); attachStreams(engine, tmap); @@ -142,58 +339,99 @@ int main(int argc, char **argv) try { std::vector hostValuesOut(batchSize * k); std::mt19937 randomEngine; - if (!ignoreData) { - writeRandomValues(target, dataType, hostIn, -50.0, 50.0, randomEngine); - copy(target, hostIn, dataType, rawHostIn.get()); + if (randomSeed) { + const auto seed = std::random_device{}(); + std::cout << "Seeding random engine with seed " << seed << "\n"; + randomEngine.seed(seed); } + // TODO: Check what happens with NaN values... + writeRandomValues(target, dataType, hostIn, -50.0, 50.0, randomEngine); + copy(target, hostIn, dataType, rawHostIn.get()); - device.bind([&](const Device &d) { engine.loadAndRun(d); }); + device.bind([&](const Device &d) { + engine.loadAndRun(d); + if (reportTotalCycles) { + std::uint64_t cycleCount; + engine.readTensor("totalCycleCount", &cycleCount); + std::cout << "Total cycles for top-k program were " << cycleCount << "\n"; + } + }); bool matchesModel = true; - if (!ignoreData) { + if (returnIndices) { copy(target, indexType, rawHostIndicesOut.get(), hostIndicesOut); + } + if (returnValues) { copy(target, dataType, rawHostValuesOut.get(), hostValuesOut); - // Verify against top-k on the host. - std::vector modelIndicesOut(batchSize * k); - std::vector modelValuesOut(batchSize * k); - { - std::vector indices(n); - for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) { - std::iota(indices.begin(), indices.end(), 0); - std::partial_sort(indices.begin(), indices.begin() + k, indices.end(), - [&](unsigned a, unsigned b) { - return hostIn[batchIdx * n + a] > - hostIn[batchIdx * n + b]; - }); - for (unsigned i = 0; i < k; ++i) { - modelIndicesOut[batchIdx * k + i] = indices[i]; - modelValuesOut[batchIdx * k + i] = hostIn[batchIdx * n + indices[i]]; - } + } + + // Verify against top-k on the host. + const auto partialSortComparator = + [&]() -> std::function { + if (largest) { + return std::greater{}; + } else { + return std::less{}; + } + }(); + + std::vector modelIndicesOut(batchSize * k); + std::vector modelValuesOut(batchSize * k); + { + std::vector indices(n); + for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) { + std::iota(indices.begin(), indices.end(), 0); + std::partial_sort(indices.begin(), indices.begin() + k, indices.end(), + [&](unsigned a, unsigned b) { + return partialSortComparator( + hostIn[batchIdx * n + a], + hostIn[batchIdx * n + b]); + }); + // Result of the partial sort will be ordered differently depending on + // whether we wanted the largest or smallest top-k elements, so ensure the + // requested correct ordering. + if ((largest && sortOrder == popops::SortOrder::ASCENDING) || + (!largest && sortOrder == popops::SortOrder::DESCENDING)) { + std::reverse(indices.begin(), indices.begin() + k); + } + for (unsigned i = 0; i < k; ++i) { + modelIndicesOut[batchIdx * k + i] = indices[i]; + modelValuesOut[batchIdx * k + i] = hostIn[batchIdx * n + indices[i]]; } } + } - // If the output isn't already supposed to be sorted, sort it on the host - // so that we can compare element for element to check the result. - if (!sortOutput) { - std::vector sortedIndices(k); - std::vector valBuffer(k); - std::vector indexBuffer(k); - for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) { - std::iota(sortedIndices.begin(), sortedIndices.end(), 0); - std::sort(sortedIndices.begin(), sortedIndices.end(), - [&](unsigned a, unsigned b) { - return hostValuesOut[batchIdx * k + a] > - hostValuesOut[batchIdx * k + b]; - }); + // If the output isn't already supposed to be sorted, sort it on the host + // so that we can compare element for element to check the result. + if (sortOrder == popops::SortOrder::NONE) { + std::vector sortedIndices(k); + std::vector valBuffer(k); + std::vector indexBuffer(k); + for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) { + std::iota(sortedIndices.begin(), sortedIndices.end(), 0); + std::sort(sortedIndices.begin(), sortedIndices.end(), + [&](unsigned a, unsigned b) { + return partialSortComparator(hostValuesOut[batchIdx * k + a], + hostValuesOut[batchIdx * k + b]); + }); + if (returnIndices) { std::copy_n(&hostIndicesOut[batchIdx * k], k, indexBuffer.begin()); - std::copy_n(&hostValuesOut[batchIdx * k], k, valBuffer.begin()); for (unsigned i = 0; i < k; ++i) { hostIndicesOut[batchIdx * k + i] = indexBuffer[sortedIndices[i]]; + } + } + if (returnValues) { + std::copy_n(&hostValuesOut[batchIdx * k], k, valBuffer.begin()); + for (unsigned i = 0; i < k; ++i) { hostValuesOut[batchIdx * k + i] = valBuffer[sortedIndices[i]]; } } } + } + double relTolerance = dataType == FLOAT ? FLOAT_REL_TOL : HALF_REL_TOL; + double absTolerance = dataType == FLOAT ? FLOAT_ABS_TOL : HALF_ABS_TOL; + if (returnIndices) { // Because 2 values might be equal and therefore the order of the indices // is not well defined, we don't directly check the indices but instead // check the values the indices point to match the data. @@ -211,11 +449,11 @@ int main(int argc, char **argv) try { } } } - double relTolerance = dataType == FLOAT ? FLOAT_REL_TOL : HALF_REL_TOL; - double absTolerance = dataType == FLOAT ? FLOAT_ABS_TOL : HALF_ABS_TOL; matchesModel &= checkIsClose("indexedValues", indexedValues.data(), {batchSize, k}, modelValuesOut.data(), batchSize * k, relTolerance, absTolerance); + } + if (returnValues) { matchesModel &= checkIsClose("values", hostValuesOut.data(), {batchSize, k}, modelValuesOut.data(), batchSize * k, relTolerance, absTolerance);