diff --git a/include/poplibs_support/Algorithm.hpp b/include/poplibs_support/Algorithm.hpp
index b9368c54..4926cec6 100644
--- a/include/poplibs_support/Algorithm.hpp
+++ b/include/poplibs_support/Algorithm.hpp
@@ -26,6 +26,11 @@ template <typename T, typename U> constexpr static auto ceildiv(T x, U y) {
   return (x + y - 1) / y;
 }
 
+/// ceildiv with 0/0 = 0
+template <typename T> static inline T ceildiv0(T q, T d) {
+  return q + d != 0 ? (q + d - 1) / d : 0;
+}
+
 template <typename T, typename U> constexpr static auto floordiv(T x, U y) {
   static_assert(std::is_unsigned<T>::value && std::is_unsigned<U>::value,
                 "Only valid for unsigned integral types");
@@ -79,6 +84,18 @@ constexpr static std::pair<T, T> balancedPartition(T n, T d) {
   return std::make_pair(a, b);
 }
 
+/// Compute the nth triangular number.
+template <typename T> static inline T nthTriangular(T n) {
+  static_assert(std::is_unsigned_v<T>);
+  return (n * (n + 1)) / 2;
+}
+
+/// Check if a number is a power of 2.
+template <typename T> static inline bool isPowerOf2(T n) {
+  static_assert(std::is_unsigned_v<T>);
+  return ((n & (n - 1)) == 0);
+}
+
 } // end namespace poplibs_support
 
 #endif // poplibs_support_Algorithm_hpp
diff --git a/include/popops/SortOrder.hpp b/include/popops/SortOrder.hpp
new file mode 100644
index 00000000..8d8bed86
--- /dev/null
+++ b/include/popops/SortOrder.hpp
@@ -0,0 +1,24 @@
+// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
+
+#ifndef _popops_SortOrder_hpp_
+#define _popops_SortOrder_hpp_
+
+#include <iosfwd>
+
+namespace popops {
+
+/// Defines a required order for sorting operations.
+enum class SortOrder {
+  /// No ordering is required.
+  NONE,
+  /// Sort in ascending order.
+  ASCENDING,
+  /// Sort in descending order.
+  DESCENDING
+};
+
+std::ostream &operator<<(std::ostream &os, const SortOrder &o);
+
+} // end namespace popops
+
+#endif // _popops_SortOrder_hpp_
diff --git a/include/popops/TopK.hpp b/include/popops/TopK.hpp
new file mode 100644
index 00000000..fef95cf7
--- /dev/null
+++ b/include/popops/TopK.hpp
@@ -0,0 +1,111 @@
+// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
+
+#ifndef _popops_TopK_hpp_
+#define _popops_TopK_hpp_
+
+#include <poplar/Graph.hpp>
+#include <poplar/Program.hpp>
+#include <poplar/Tensor.hpp>
+
+#include <popops/SortOrder.hpp>
+
+namespace popops {
+
+/** Parameters for topK* APIs
+ */
+struct TopKParams {
+  /// The number of outputs from the top k operation.
+  /// This must be less or equal the number of elements in the innermost
+  /// dimension of the tensor used as input to the operation.
+  unsigned k;
+  /// If true, return the top k largest elements. Otherwise return the
+  /// top k smallest elements.
+  bool largest;
+  /// The required ordering of elements in the resulting tensor.
+  SortOrder sortOrder;
+
+  TopKParams(unsigned k, bool largest, SortOrder sortOrder) noexcept;
+};
+
+std::ostream &operator<<(std::ostream &os, const TopKParams &p);
+
+/** Create an return a new tensor laid out optimally to be used as
+ *  an input to a topK operation with the given parameters.
+ *
+ *  \param graph        The Poplar graph to add the tensor to.
+ *  \param type         The Poplar type of elements in the returned tensor.
+ *  \param shape        The shape of the returned tensor.
+ *  \param params       The parameters of the top k that the returned tensor
+ *                      will be used as input to.
+ *  \param debugContext Optional debug information.
+ *
+ *  \returns A newly created tensor with shape \p shape and full tile mapping.
+ */
+poplar::Tensor createTopKInput(poplar::Graph &graph, const poplar::Type &type,
+                               const std::vector<std::size_t> &shape,
+                               const TopKParams &params,
+                               const poplar::DebugContext &debugContext = {});
+
+/** Return the top k values in the innermost dimension of a tensor.
+ *
+ *  \param graph        The Poplar graph to add the operation to.
+ *  \param prog         The Poplar sequence to add the operation to.
+ *  \param t            The tensor in which to find the top-k values in
+ *                      the innermost dimension.
+ *  \param params       The parameters of the top k.
+ *  \param debugContext Optional debug information.
+ *
+ *  \returns A tensor with the top k values found in the innermost dimension
+ *           of \p t.
+ */
+poplar::Tensor topK(poplar::Graph &graph, poplar::program::Sequence &prog,
+                    const poplar::Tensor &t, const TopKParams &params,
+                    const poplar::DebugContext &debugContext = {});
+
+/** Return the top k values in the innermost dimension of a tensor along
+ *  with the permutation of another tensor with respect to the values.
+ *
+ *  \param graph        The Poplar graph to add the operation to.
+ *  \param prog         The Poplar sequence to add the operation to.
+ *  \param key          The tensor in which to find the top-k values in
+ *                      the innermost dimension.
+ *  \param value        A tensor with the same shape as \p key for which to
+ *                      get the permutation with respect to \p key.
+ *  \param params       The parameters of the top k.
+ *  \param debugContext Optional debug information.
+ *
+ *  \returns A pair of tensors. The first contains the top k values found
+ *           in the innermost dimension of \p key. The second contains the
+ *           permutation of the tensor \p value with respect to the tensor
+ *           \p key.
+ */
+std::pair<poplar::Tensor, poplar::Tensor>
+topKKeyValue(poplar::Graph &graph, poplar::program::Sequence &prog,
+             const poplar::Tensor &keys, const poplar::Tensor &values,
+             const TopKParams &params,
+             const poplar::DebugContext &debugContext = {});
+
+/** Return the top k values in the innermost dimension of a tensor along
+ *  with the indices of those values in the input tensor in the innermost
+ *  dimension.
+ *
+ *  \param graph        The Poplar graph to add the operation to.
+ *  \param prog         The Poplar sequence to add the operation to.
+ *  \param t            The tensor in which to find the top-k values in
+ *                      the innermost dimension.
+ *  \param params       The parameters of the top k.
+ *  \param debugContext Optional debug information.
+ *
+ *  \returns A pair of tensors. The first contains the top k values found
+ *           in the innermost dimension of \p t. The second contains the
+ *           indices of those values in the innermost dimension of \p t in
+ *           the original input.
+ */
+std::pair<poplar::Tensor, poplar::Tensor>
+topKWithPermutation(poplar::Graph &graph, poplar::program::Sequence &prog,
+                    const poplar::Tensor &t, const TopKParams &params,
+                    const poplar::DebugContext &debugContext = {});
+
+} // end namespace popops
+
+#endif // _popops_TopK_hpp_
diff --git a/lib/poplibs_test/Util.cpp b/lib/poplibs_test/Util.cpp
index 0367d4fa..f77a9db1 100644
--- a/lib/poplibs_test/Util.cpp
+++ b/lib/poplibs_test/Util.cpp
@@ -199,6 +199,7 @@ bool checkIsClose(FPType a, FPType b, double relativeTolerance) {
 
 template bool checkIsClose<bool>(bool, bool, double);
 template bool checkIsClose<int>(int, int, double);
+template bool checkIsClose<unsigned>(unsigned, unsigned, double);
 template bool checkIsClose<float>(float, float, double);
 template bool checkIsClose<double>(double, double, double);
 
diff --git a/lib/popops/BitonicTopK.cpp b/lib/popops/BitonicTopK.cpp
new file mode 100644
index 00000000..8f74ba15
--- /dev/null
+++ b/lib/popops/BitonicTopK.cpp
@@ -0,0 +1,982 @@
+// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
+
+#include <popops/Cast.hpp>
+#include <popops/Encoding.hpp>
+#include <popops/TopK.hpp>
+
+#include <poputil/DebugInfo.hpp>
+#include <poputil/TileMapping.hpp>
+#include <poputil/Util.hpp>
+
+#include <poplibs_support/Algorithm.hpp>
+#include <poplibs_support/VectorUtils.hpp>
+#include <poplibs_support/gcd.hpp>
+#include <poplibs_support/logging.hpp>
+#include <poplibs_support/print.hpp>
+
+#include <type_traits>
+#include <unordered_map>
+
+#include <boost/functional/hash.hpp>
+
+using namespace poplar;
+using namespace poplar::program;
+using namespace poplibs_support;
+using namespace poputil;
+
+namespace poputil {
+
+template <> poplar::ProfileValue toProfileValue(const popops::SortOrder &o) {
+  switch (o) {
+  case popops::SortOrder::NONE:
+    return poplar::ProfileValue("NONE");
+  case popops::SortOrder::ASCENDING:
+    return poplar::ProfileValue("ASCENDING");
+  case popops::SortOrder::DESCENDING:
+    return poplar::ProfileValue("DESCENDING");
+  default:
+    return poplar::ProfileValue("<UNKNOWN>");
+  }
+}
+} // namespace poputil
+
+namespace popops {
+namespace bitonic {
+
+/** This returns the number of comparisons we will do at the given distance
+ *  into a flat array with n elements.
+ *
+ *  e.g. n = 15 and distance = 4
+ *
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *  |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *    +--------------->               +--------------->
+ *        +--------------->               +--------------->
+ *            +--------------->               +--------------->
+ *                +--------------->
+ *
+ *  gives the number of comparisons as 7.
+ */
+template <typename T>
+static inline T numComparisonsAtDistance(T n, T distance) {
+  const auto q = floordiv(n, distance * 2);
+  const auto r = n % (distance * 2);
+  const auto comparisons = q * distance + r - std::min(r, distance);
+  return comparisons;
+}
+
+namespace {
+
+struct TensorKey {
+  unsigned n;
+  unsigned comparisons;
+  unsigned distance;
+  TensorKey(unsigned n, unsigned distance, unsigned nActive)
+      : n(n), comparisons(numComparisonsAtDistance(nActive, distance)),
+        distance(distance) {}
+  inline bool operator<(const TensorKey &other) const {
+    return std::tie(n, comparisons, distance) <
+           std::tie(other.n, other.comparisons, other.distance);
+  }
+  inline bool operator==(const TensorKey &other) const {
+    return std::tie(n, comparisons, distance) ==
+           std::tie(other.n, other.comparisons, other.distance);
+  }
+};
+
+struct TensorKeyHash {
+  std::size_t operator()(const TensorKey &k) const {
+    std::size_t seed = 0;
+    boost::hash_combine(seed, k.n);
+    boost::hash_combine(seed, k.comparisons);
+    boost::hash_combine(seed, k.distance);
+    return seed;
+  }
+};
+
+using TensorCache = std::unordered_map<TensorKey, Tensor, TensorKeyHash>;
+
+} // end anonymous namespace
+
+/** Allocates a tensor to be used when doing a compare and swap step
+ *  with the given parameters. In particular this manages the correct
+ *  mapping of elements to the tiles on which they will be processed
+ *  and correct balancing of elements not processed across remaining
+ *  tiles.
+ *
+ *  This is closely tied to the compareAndSwapAtDistance function.
+ */
+static Tensor allocate(Graph &graph, const Type &type, std::size_t n,
+                       std::size_t nActive, std::size_t distance,
+                       const DebugNameAndId &dnai) {
+  const auto t = graph.addVariable(type, {n}, {dnai});
+  const auto numTiles = graph.getTarget().getNumTiles();
+
+  const auto comparisons = numComparisonsAtDistance(nActive, distance);
+  // Allocation in each stage is done by first allocating to tiles elements
+  // that are 'active' i.e. take part in a comparison at the given distance.
+  const auto maxComparisonsPerTile = ceildiv(comparisons, numTiles);
+  const auto activeTiles = ceildiv0(comparisons, maxComparisonsPerTile);
+
+  // We know from the total number of elements n what the layout will be for
+  // a tensor with distance equal to 1, which we use to determine how to
+  // spread inactive elements over tiles.
+  const auto maxActivePairs = floordiv(n, 2u);
+  const auto maxAllocatedPairsPerTile = ceildiv(maxActivePairs, numTiles);
+  const auto maxUsedTiles = ceildiv0(maxActivePairs, maxAllocatedPairsPerTile);
+
+  // First allocate first comparisons * 2 elements to tiles according to
+  // which will process those comparisons.
+  for (unsigned tile = 0; tile < activeTiles; ++tile) {
+    const auto begin = tile * maxComparisonsPerTile;
+    const auto end = std::min((tile + 1) * maxComparisonsPerTile, comparisons);
+    graph.setTileMapping(t.slice(begin * 2, end * 2), tile);
+  }
+  // Then allocate inactive elements to tiles, balancing total number of
+  // elements allocated as much as possible.
+  auto offset = comparisons * 2;
+  auto inactiveRemaining = n - comparisons * 2;
+  for (unsigned tile = 0; tile < maxUsedTiles; ++tile) {
+    std::size_t activePairsThisTile = 0;
+    if (tile < activeTiles) {
+      const auto begin = tile * maxComparisonsPerTile;
+      const auto end =
+          std::min((tile + 1) * maxComparisonsPerTile, comparisons);
+      activePairsThisTile = (end - begin);
+    }
+    const auto inactiveThisTile =
+        std::min((maxAllocatedPairsPerTile - activePairsThisTile) * 2,
+                 inactiveRemaining);
+    graph.setTileMapping(t.slice(offset, offset + inactiveThisTile), tile);
+    inactiveRemaining -= inactiveThisTile;
+    offset += inactiveThisTile;
+  }
+
+  // If there is an odd element, we'll arbitrarily map it to the last tile.
+  if (offset < n) {
+    // We assume there is only ever 1 'odd' element
+    assert(n - offset == 1);
+    graph.setTileMapping(t.slice(offset, offset + 1),
+                         std::max(maxUsedTiles, 1ul) - 1);
+    offset += 1;
+  }
+
+  assert(offset == n);
+
+  return t;
+}
+
+/** This gives the inverse of a permutation of elements of a tensor.
+ */
+static std::vector<Interval>
+getInversePermutation(const std::vector<Interval> &is) {
+  std::vector<Interval> matchingSlices;
+  matchingSlices.reserve(is.size());
+  std::size_t offset = 0;
+  for (const auto &i : is) {
+    matchingSlices.emplace_back(offset, offset + i.size());
+    offset += i.size();
+  }
+
+  std::vector<std::size_t> sortedIndices(is.size());
+  std::iota(sortedIndices.begin(), sortedIndices.end(), 0);
+  std::sort(sortedIndices.begin(), sortedIndices.end(),
+            [&](const auto a, const auto b) {
+              return is[a].begin() < is[b].begin();
+            });
+
+  std::vector<Interval> inverse;
+  inverse.reserve(is.size());
+  for (const auto &i : sortedIndices) {
+    inverse.push_back(matchingSlices[i]);
+  }
+  return inverse;
+}
+
+/** Given an offset into a number of comparisons to do and the distance
+ *  at which to compare, this gives the offset in elements for the lhs
+ *  element taking part on that comparison in a flat array.
+ *
+ *  The calculation is essentially taking the number of multiples of
+ *  the distance d * 2 to get number of elements plus any remainder.
+ *
+ *  e.g. for distance = 4 we have a flat array and the comparisons that
+ *  will be done on that array
+ *
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *  |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *    +--------------->               +--------------->
+ *    0   +--------------->           4   +--------------->
+ *        1   +--------------->           5
+ *            2   +--------------->
+ *                3
+ *
+ *  For comparison offset 3 we have no multiples of distance (4), plus
+ *  the remainder 3, so the offset for the lhs element in that comparison
+ *  is 3 as can be seen from the diagram. For comparison offset 5 we have
+ *  1 multiple of 4, multiplied by 2 and add the remainder 1 equals 9 which
+ *  again is evident from the diagram.
+ */
+template <typename T>
+static inline T comparisonToElemOffset(T offset, T distance) {
+  return roundDown(offset, distance) * 2 + offset % distance;
+}
+
+/** Given a tensor which was allocated with the given parameters, reorder
+ *  it to give the canonical order. The canonical order is with all elements
+ *  in the order they were given to the top-k operation.
+ *
+ *  We use the canonical order as a common ordering in which to transform
+ *  tensors between different steps of the top-k operation.
+ */
+static Tensor toCanonicalOrder(const Graph &graph, const Tensor &t,
+                               std::size_t distance, std::size_t nActive) {
+  assert(t.rank() == 1);
+
+  const auto n = t.dim(0);
+
+  // Already in canonical order
+  if (distance == 1 && nActive == n) {
+    return t;
+  }
+
+  const auto numTiles = graph.getTarget().getNumTiles();
+
+  const auto comparisons = numComparisonsAtDistance(nActive, distance);
+  const auto maxComparisonsPerTile = ceildiv(comparisons, numTiles);
+  const auto activeTiles = ceildiv0(comparisons, maxComparisonsPerTile);
+
+  // This function just combines slices that are actually continuous.
+  std::vector<Interval> slices;
+  // Just reserve something, the exact number is hard to calculate up front.
+  slices.reserve(activeTiles);
+  const auto appendSlice = [&](const auto begin, const auto end) {
+    if (begin == end) {
+      return;
+    }
+    if (!slices.empty() && slices.back().end() == begin) {
+      slices.back() = Interval(slices.back().begin(), end);
+    } else {
+      slices.emplace_back(begin, end);
+    }
+  };
+
+  // We actually build the permutation of the canonically ordered tensor's
+  // elements you would take to get the input tensor and then get the inverse
+  // permutation and apply that to the tensor to retrieve the canonical form.
+  for (unsigned tile = 0; tile < activeTiles; ++tile) {
+    auto begin = tile * maxComparisonsPerTile;
+    const auto end = std::min((tile + 1) * maxComparisonsPerTile, comparisons);
+
+    if (const auto pre = std::min(distance - begin % distance, end - begin)) {
+      const auto elemOffset = comparisonToElemOffset(begin, distance);
+      appendSlice(elemOffset, elemOffset + pre);
+      appendSlice(elemOffset + distance, elemOffset + distance + pre);
+      begin += pre;
+    }
+    // Add multiples of 2 * distance chunks of elements.
+    if (const auto multipleOfDistanceComparisons =
+            roundDown(end - begin, distance)) {
+      appendSlice(begin * 2, (begin + multipleOfDistanceComparisons) * 2);
+      begin += multipleOfDistanceComparisons;
+    }
+    if (const auto post = end - begin) {
+      const auto elemOffset = comparisonToElemOffset(begin, distance);
+      appendSlice(elemOffset, elemOffset + post);
+      appendSlice(elemOffset + distance, elemOffset + distance + post);
+      begin += post;
+    }
+    assert(begin == end);
+  }
+
+  // This method is tied closely with the bitonic allocate method, any inactive
+  // elements when comparing at a particular distance are always at the end of
+  // the tensor so we can just tack them on.
+  const auto remainingOffset = comparisonToElemOffset(comparisons, distance);
+  appendSlice(remainingOffset, remainingOffset + n - comparisons * 2);
+
+  return concat(t.slices(getInversePermutation(slices)));
+}
+
+/** Allocate a new tensor and rearrange the given input into it in preparation
+ *  for a compare and swap step with the given parameters.
+ *
+ *  Assumes src is in canonical order.
+ */
+static Tensor rearrangeForStep(Graph &graph, Sequence &prog, const Tensor &src,
+                               unsigned dstDistance, unsigned dstActive,
+                               TensorCache &tensorCache,
+                               const DebugNameAndId &dnai) {
+  const auto n = src.dim(0);
+  const TensorKey dstKey(n, dstDistance, dstActive);
+  auto cacheIt = tensorCache.find(dstKey);
+  if (cacheIt == tensorCache.end()) {
+    const auto t =
+        allocate(graph, src.elementType(), n, dstActive, dstDistance, dnai);
+    cacheIt = tensorCache.emplace(dstKey, t).first;
+  }
+  const auto &dst = cacheIt->second;
+  const auto &dstReordered =
+      toCanonicalOrder(graph, dst, dstDistance, dstActive);
+  prog.add(Copy(src, dstReordered));
+  return dst;
+}
+
+/** WorklistBuilder is a one-time use class that is used per-tile to
+ *  build worklists based on the available work on a tile.
+ */
+struct WorklistBuilder {
+  const unsigned numWorkers;
+  const unsigned maxComparisonsPerWorker;
+  const bool initialOrder;
+  const unsigned distanceToChangeOrder;
+  unsigned globalComparisonOffset;
+
+  using Worklists = std::vector<std::vector<std::size_t>>;
+  Worklists lists;
+
+  // The number of entries in the worklist for the current worker.
+  unsigned numWorkerEntries = 0;
+  // The number of remaining comparisons this worker can do based
+  // on the maximum per worker.
+  unsigned remainingWorkerComparisons = 0;
+  // The offset into the data on this tile so far.
+  unsigned tileOffset = 0;
+
+  WorklistBuilder(unsigned numWorkers, unsigned maxComparisonsPerWorker,
+                  bool initialOrder, unsigned distanceToChangeOrder,
+                  unsigned globalComparisonOffset)
+      : numWorkers(numWorkers),
+        maxComparisonsPerWorker(maxComparisonsPerWorker),
+        initialOrder(initialOrder),
+        distanceToChangeOrder(distanceToChangeOrder),
+        globalComparisonOffset(globalComparisonOffset) {
+    lists.reserve(numWorkers);
+  }
+
+  // Work is limited by and assumed to be a multiple of the given distance
+  // hence the parameters are the distance and a multiple, rather than a more
+  // flexible total number of comparisons.
+  void addWork(unsigned distance, unsigned repeats = 1) {
+    const auto numComparisons = distance * repeats;
+    auto remainingComparisons = numComparisons;
+
+    unsigned comparisonOffset = 0;
+    // NOTE: The encoding of the worklists is described alongside the c++
+    // implementation of the vertex.
+    while (remainingComparisons != 0) {
+      // Start a new worker
+      if (remainingWorkerComparisons == 0) {
+        // Fill out the number of entries for the last worker if there
+        // was one.
+        if (numWorkerEntries != 0) {
+          assert(!lists.empty());
+          lists.back().front() = numWorkerEntries - 1;
+          numWorkerEntries = 0;
+        }
+
+        lists.emplace_back();
+        // Reserve a spot for number of entries in the worklist
+        lists.back().emplace_back();
+        const auto currWorkOffset =
+            comparisonToElemOffset(comparisonOffset, distance);
+        lists.back().emplace_back(tileOffset + currWorkOffset);
+
+        // Knowing the starting order of comparison over the whole
+        // input, we can work out for the slice of comparisons that
+        // this worker does what order it should start comparing
+        // in by using the starting offset for this worker into the
+        // whole input to determine how many changes of order have come before.
+        const bool workerInitialOrder =
+            initialOrder ^
+            ((globalComparisonOffset / distanceToChangeOrder) & 1u);
+        // The worker keeps a count of comparisons left before it must
+        // flip the order in which it compares and swaps. Because the
+        // worker may start at any offset into the comparisons to be
+        // done over the whole input, we provide a count to initialise
+        // the counter with based on the offset into the whole input that
+        // this worker starts its work. The worker will calculate
+        // the remaining compare and swaps to do before flipping the order
+        // by taking distanceToChangeOrder - initialCount.
+        const auto initialCount =
+            globalComparisonOffset % distanceToChangeOrder;
+        assert(initialCount < (1u << 31));
+        // We encode the initial direction and initial offset into
+        // 32-bits in the worklist where the upper 31 bits are allocated
+        // to the initial count and the lowest bit is allocated to the
+        // initial order.
+        const unsigned short initialLower =
+            ((initialCount & ((1u << 15u) - 1)) << 1u) | workerInitialOrder;
+        const unsigned short initialUpper =
+            (initialCount >> 15u) & ((1u << 16) - 1);
+        lists.back().emplace_back(initialLower);
+        lists.back().emplace_back(initialUpper);
+
+        remainingWorkerComparisons = maxComparisonsPerWorker;
+
+        // Similar to initialCount, due to the fact that a worker can
+        // start at any offset into the work to be done on this tile/
+        // over the whole input, we provide an initial count for the
+        // innermost loop that would ordinarily otherwise be set to
+        // 'distance' for the first entry in the worklist.
+        const auto firstInnerCount = std::min(
+            distance - comparisonOffset % distance,
+            std::min(remainingComparisons, remainingWorkerComparisons));
+        lists.back().emplace_back(firstInnerCount);
+      }
+
+      const auto comparisonsThisEntry =
+          std::min(remainingComparisons, remainingWorkerComparisons);
+
+      lists.back().emplace_back(distance);
+      lists.back().emplace_back(comparisonsThisEntry);
+
+      comparisonOffset += comparisonsThisEntry;
+      globalComparisonOffset += comparisonsThisEntry;
+      remainingWorkerComparisons -= comparisonsThisEntry;
+      remainingComparisons -= comparisonsThisEntry;
+      ++numWorkerEntries;
+    }
+    tileOffset += numComparisons * 2;
+  }
+
+  // Invalidates the builder and returns the worklists that were built.
+  Worklists finish() {
+    if (numWorkerEntries != 0) {
+      assert(!lists.empty());
+      lists.back().front() = numWorkerEntries - 1;
+      numWorkerEntries = 0;
+    }
+    return std::move(lists);
+  }
+};
+
+/** The basic building block for the algorithm. This step
+ *  performs a compare and swap at the given distance if the input
+ *  tensors were flat array in canonical order.
+ *
+ *  e.g. nActive = 15, distance = 4, distanceToChangeOrder = 4
+ *
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *  |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *    +--------------->               <---------------+
+ *        +--------------->               <---------------+
+ *            +--------------->               <---------------+
+ *                +--------------->
+ *
+ *  e.g. nActive = 15, distance = 2, distanceToChangeOrder = 4
+ *
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *  |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ *  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ *    +------->       +------->       <-------+       <-------+
+ *        +------->       +------->       <-------+
+ *
+ *  In these examples each arrow indicates the 2 elements are compared,
+ *  and if the element the arrow points to is less than the element it
+ *  starts from, the elements are swapped.
+ *
+ *  We add some limitations to this step in order to appease the
+ *  hardware. The main one is that the distance is consistent for the
+ *  entire operation, as is the distance at which we flip the order of
+ *  comparison.
+ *
+ *  The actual implementation maps comparisons to tiles by taking the
+ *  flat array, and evenly splitting the work amongst tiles. Where the
+ *  lhs and rhs of the comparison reside on different tiles, we gather
+ *  the comparisons to be done in chunks of powers of 2. This limits
+ *  the amount of exchange code required to rearrange for each step,
+ *  and limits the overhead we add would add by having the vertex deal
+ *  with different distances at which to compare to a number of different
+ *  distances that is roughly logarithmic with respect to the number
+ *  of elements per tile (in each compare and swap step).
+ */
+static std::string getVertexClass(const Tensor &keys,
+                                  const std::optional<Tensor> &values) {
+  if (values) {
+    return templateVertex("popops::CompareAndSwapAtDistanceKeyVal",
+                          keys.elementType(), values->elementType());
+  } else {
+    return templateVertex("popops::CompareAndSwapAtDistance",
+                          keys.elementType());
+  }
+}
+static void
+compareAndSwapAtDistance(Graph &graph, Sequence &prog, const Tensor &keys,
+                         const std::optional<Tensor> &values, unsigned distance,
+                         unsigned distanceToChangeOrder, bool initialOrder,
+                         unsigned nActive, const DebugNameAndId &dnai) {
+  assert(!values || keys.shape() == values->shape());
+
+  const auto vertexClass = getVertexClass(keys, values);
+
+  const auto &target = graph.getTarget();
+  const auto numTiles = target.getNumTiles();
+  const auto numWorkers = target.getNumWorkerContexts();
+
+  const auto comparisons = numComparisonsAtDistance(nActive, distance);
+  const auto maxComparisonsPerTile = ceildiv(comparisons, numTiles);
+  const auto activeTiles = ceildiv0(comparisons, maxComparisonsPerTile);
+
+  const auto cs = graph.addComputeSet({dnai, "CompareAndSwap"});
+  for (unsigned tile = 0; tile < activeTiles; ++tile) {
+    auto begin = tile * maxComparisonsPerTile;
+    const auto end = std::min((tile + 1) * maxComparisonsPerTile, comparisons);
+    const auto comparisonsThisTile = end - begin;
+
+    const auto v = graph.addVertex(cs, vertexClass);
+    graph.setTileMapping(v, tile);
+
+    graph.connect(v["keys"], keys.slice(begin * 2, end * 2).flatten());
+    if (values) {
+      graph.connect(v["values"], values->slice(begin * 2, end * 2).flatten());
+    }
+
+    graph.setInitialValue(v["distanceToChangeOrder"], distanceToChangeOrder);
+
+    // We just flatten all the comparisons to be done and evenly spread
+    // them amongst workers for now without consideration of how the split
+    // might introduce uneven overheads between workers.
+    const auto comparisonsPerWorker = ceildiv(comparisonsThisTile, numWorkers);
+    WorklistBuilder builder(numWorkers, comparisonsPerWorker, initialOrder,
+                            distanceToChangeOrder, begin);
+
+    if (const auto pre = std::min(distance - begin % distance, end - begin)) {
+      builder.addWork(pre);
+      begin += pre;
+    }
+    if (const auto multipleOfDistanceComparisons =
+            floordiv(end - begin, distance)) {
+      builder.addWork(distance, multipleOfDistanceComparisons);
+      begin += multipleOfDistanceComparisons * distance;
+    }
+    if (const auto post = end - begin) {
+      builder.addWork(post);
+      begin += post;
+    }
+    assert(begin == end);
+
+    const auto &worklists = builder.finish();
+
+    const auto worklistsField = v["worklists"];
+    graph.setFieldSize(worklistsField, worklists.size());
+    for (unsigned i = 0; i < worklists.size(); ++i) {
+      const auto t =
+          graph.addConstant(UNSIGNED_SHORT, {worklists[i].size()},
+                            worklists[i].data(), {dnai, "worklists"});
+      graph.setTileMapping(t, tile);
+      graph.connect(worklistsField[i], t);
+    }
+  }
+  prog.add(Execute(cs));
+}
+
+/** Create an input for a top-k laid out in the most efficient
+ *  way.
+ */
+Tensor createTopKInputImpl(Graph &graph, const Type &type,
+                           const std::vector<std::size_t> &shape,
+                           const DebugNameAndId &dnai) {
+  if (shape.empty()) {
+    throw poplibs_error("shape must have at least 1 dimension");
+  }
+  const auto numElems = product(shape);
+  const auto n = shape.back();
+  const auto b = numElems / n;
+  auto t = allocate(graph, type, b * n, b * n, 1, dnai);
+  return t.reshape({n, b}).transpose().reshape(shape);
+}
+
+/** Core implementation for all variants.
+ *
+ *  All the variants are implemented by application of our basic
+ *  compareAndSwapAtDistance operation some number of times with the
+ *  right parameters. A key limitation of compareAndSwapAtDistance is that
+ *  it requires us to compare and swap at a consistent distance across the
+ *  entire array, flipping the order imposed at another consistent interval.
+ *
+ *  ========= Sort =========
+ *  I will not describe a bitonic sorting network for powers of 2 as this is
+ *  well documented. For n a power of 2 the above limitation on
+ *  compareAndSwapAtDistance is not a problem, as the size of partitions in
+ *  each step (the distance at which we compare and swap) is always a
+ *  consistent factor of 2 greater than the last step across the whole n
+ *  elements to be sorted.
+ *
+ *  For n not a power of 2 we compose the sorting network of power-of-2
+ *  sized sorting networks. e.g. if n=100, we compose the full sort with
+ *  smaller sorting networks of size 64, 32, 4 (64 + 32 + 4 = 100). The
+ *  result of these power of 2 sized networks are merged and re-sorted
+ *  hierarchically.
+ *
+ *  With regards the limitation on compare and swap described above,
+ *  the key insight is that when all the networks are of size a power of 2,
+ *  sorting the sub-sequences into order always uses ascending powers of 2
+ *  distances at which to compare and swap. Once the sub-sequences are in
+ *  order merging them to form a larger non-power of 2 sized sequence is
+ *  always a compare and swap at the next largest power of 2. We can therefore
+ *  apply a series of compare and swap steps with increasing powers of 2
+ *  distances to form our final fully sorted sequence and in each step the
+ *  distance to compare and swap is always consistent across the whole
+ *  array.
+ *
+ *  When merging the result of our power of 2 sized sorting networks,
+ *  we must take care. In these cases we can do one of the following:
+ *
+ *   * Given an ascending/descending bitonic sequence s of size n and
+ *     known inflection point p, we can compare and swap at distance
+ *     p with descending order, and the result is guaranteed to be such
+ *     that all in the set of elements left of p are greater than all
+ *     in the set of elements right of p.
+ *   * Conversely, given a descending/ascending bitonic sequence s of
+ *     size n and known inflection point p, we can compare and swap at
+ *     distance p with ascending order, and the result is guaranteed
+ *     to be such that all in the set of elements left of p are less
+ *     than all in the set of elements right of p.
+ *
+ *  In order to use the above this means each time we merge sorted power
+ *  of 2 sequences, they must be sorted in a particular order. Different
+ *  powers of 2 will take different numbers of compare and swap steps to
+ *  end up in the correct order for merging. Since the smaller powers of
+ *  2 must always take part in the same or fewer compare and swap operations
+ *  than the larger powers of 2, we can carefully omit them from the
+ *  right compare and swap operations on larger powers of 2 to ensure they
+ *  still end up in the right order for merging and we get the correct
+ *  result.
+ *
+ *  ========= Top-K =========
+ *  In order to handle top-k, the general idea is to build sorted sequences
+ *  of size k with alternating ascending/descending order, allowing us to
+ *  merge 2 sequences of k with opposing ordering giving a result where one of
+ *  the sequences has elements which are all greater than the elements in
+ *  the other sequence. Each time we do this, we discard the smaller
+ *  sequence and re-sort the sequences of k elements into ascending/
+ *  descending order and repeat until we are left with k elements.
+ *
+ *  We make a small modification to this which is we actually first build
+ *  sorted sequences of k' elements where k' is the next power of 2 greater
+ *  then or equal to k. This again lets us keep our distance of compare and
+ *  swap consistent in each step and doesn't further complicate the compare
+ *  and swap operation. To get the final result if k is not a power of 2
+ *  we keep sorting and discarding descreasing sized powers of 2 until we
+ *  are left with just k elements.
+ */
+std::pair<Tensor, Tensor> topKImpl(Graph &graph, Sequence &prog,
+                                   const Tensor &t_,
+                                   const std::optional<Tensor> &other_,
+                                   const unsigned k, const bool largest,
+                                   const bool sorted, const bool ascendingOrder,
+                                   const DebugNameAndId &dnai) {
+
+  const auto inputType = t_.elementType();
+  if (inputType != HALF && inputType != FLOAT) {
+    throw poplibs_error(
+        "Unsupported data type for top-k " + inputType.toString() +
+        ". half and float are the only supported types at present.");
+  }
+
+  if (other_) {
+    const auto otherType = other_->elementType();
+    if (otherType != UNSIGNED_INT) {
+      throw poplibs_error("Unsupported data type for other tensor in top-k " +
+                          otherType.toString() +
+                          ". Only unsigned int is supported at present");
+    }
+    if (other_->shape() != t_.shape()) {
+      throw poplibs_error("t.shape() (" + toString(t_.shape()) +
+                          " != other.shape() (" + toString(other_->shape()) +
+                          "). Other tensor in topKKeyValue must have matching "
+                          "shape to tensor in which to find top-k.");
+    }
+  }
+
+  if (t_.rank() == 0) {
+    throw poplibs_error("t must have at least one dimension");
+  }
+
+  const std::vector<std::size_t> outputShape = [&] {
+    auto s = t_.shape();
+    s.back() = k;
+    return s;
+  }();
+
+  // We use {n, batch} as our canonical internal representation for
+  // the shape.
+  auto t =
+      t_.rank() >= 2 ? t_.flatten(0, t_.rank() - 1) : t_.flatten().expand({0});
+  t = t.transpose();
+  std::optional<Tensor> other;
+  if (other_) {
+    other = other_->rank() >= 2 ? other_->flatten(0, other_->rank() - 1)
+                                : other_->flatten().expand({0});
+    other = other->transpose();
+  }
+
+  unsigned n = t.dim(0);
+  const unsigned b = t.dim(1);
+
+  logging::popops::debug("bitonicTopK(batchSize={}, n={}, k={}, sorted={}, "
+                         "haveOther={}, debugPath='{}')",
+                         b, n, k, sorted, (other ? "true" : "false"),
+                         dnai.getPathName());
+
+  if (k > n) {
+    throw poplibs_error(
+        "k (" + std::to_string(k) +
+        ") must be less than or equal to the number of input elements (" +
+        std::to_string(n) + ")");
+  }
+
+  const auto logK = ceilLog2(k);
+  const auto logN = ceilLog2(n);
+  // We define k' as the next power of 2 greater or equal than k.
+  const auto kDash = (1u << logK);
+  const auto stepsToSortK = nthTriangular(logK);
+  const auto totalSteps = nthTriangular(logN);
+
+  logging::popops::debug(
+      "bitonicTopKImpl: Calculated no. of steps: total={}, k={}", totalSteps,
+      stepsToSortK);
+
+  // Edge cases where the output is zero-sized.
+  if (b * k == 0) {
+    Tensor tResult =
+        graph.addVariable(t.elementType(), outputShape, {dnai, "keys"});
+    Tensor otherResult;
+    if (other) {
+      otherResult = graph.addVariable(other->elementType(), outputShape,
+                                      {dnai, "values"});
+    }
+    return std::make_pair(std::move(tResult), std::move(otherResult));
+  }
+
+  t = t.flatten();
+  if (other) {
+    other = other->flatten();
+  }
+
+  // Handle some trivial cases where this is a no-op.
+  //
+  //  * We already have the top/bottom k elements and the result doesn't
+  //    need to sorted.
+  //  * n is 1, the output is sorted by default.
+  if ((k == n && !sorted) || n == 1) {
+    // We assume the result does not alias the input and is writeable so
+    // ensure that is the case.
+    t = poputil::duplicate(graph, t, prog, {dnai, "keys"});
+    t = t.reshape({k, b}).transpose().reshape(outputShape);
+    if (other) {
+      other = poputil::duplicate(graph, *other, prog, {dnai, "values"});
+      other = other->reshape({k, b}).transpose().reshape(outputShape);
+    }
+    return std::make_pair(std::move(t), other.value_or(Tensor{}));
+  }
+
+  // We use the float path to implement half data type for now.
+  const auto dataType = FLOAT;
+  if (inputType != dataType) {
+    t = cast(graph, t, dataType, prog, dnai);
+  }
+
+  // Because we always discard the upper of a pair of sequences of k'
+  // elements, the merge order when merging sequences is based off
+  // leaving the top/bottom k' elements in each pair of k' sequences
+  // in the lower k' elements.
+  const bool mergeKSequencesOrder = !largest;
+  const bool oddKSequences = bool((n / kDash) & 1u);
+
+  TensorCache tCache, otherCache;
+  for (unsigned mergeStep = 0; mergeStep < logN; ++mergeStep) {
+    const auto logMergeDistance = std::min(mergeStep, logK);
+    const auto mergeDistance = 1u << logMergeDistance;
+    const auto stepName = "Merge" + std::to_string(mergeStep);
+
+    const bool mergeOrder = [&] {
+      bool order;
+      if (mergeStep >= logK) {
+        // Use the merge order for merging sequences of k' elements if that
+        // is what we are doing.
+        order = mergeKSequencesOrder;
+      } else {
+        // If this is the last step, we need to sort in the desired order.
+        if (mergeStep == logN - 1) {
+          order = ascendingOrder;
+          // If this is the last step of sorting k' sequences, make sure they
+          // are sorted in the order needed for merging sequences of k'
+          // elements.
+        } else if (mergeStep + 1 == logK) {
+          order = !mergeKSequencesOrder;
+        } else {
+          // Otherwise, we merge based on creating sorted sequences in the
+          // order needed either to create the final sorted sequence or to
+          // merge k' sequences correctly.
+          order = logK != logN ? (mergeKSequencesOrder ^ oddKSequences)
+                               : !ascendingOrder;
+        }
+      }
+      return order;
+    }();
+
+    // As mentioned at the signature of this function, we must sometimes
+    // omit the parts of n composed of smaller powers of 2 in order to
+    // result in the correctly ordered sequence once merged with another
+    // power of 2. This check was sort of experimentally worked out. In
+    // the interests of time, I'm leaving it as it works but this should
+    // be justified and there may be a simpler calculation.
+    const auto nMod2d = n % (mergeDistance * 2);
+    const auto nMod4d = n % (mergeDistance * 4);
+    auto nThisStep = n;
+    if (mergeStep + 1 < logK && (nMod2d == 0 || nMod2d == nMod4d)) {
+      nThisStep -= nMod4d;
+    }
+
+    t = rearrangeForStep(graph, prog, t, mergeDistance * b, nThisStep * b,
+                         tCache, {dnai, "keys" + stepName});
+    if (other) {
+      other = rearrangeForStep(graph, prog, *other, mergeDistance * b,
+                               nThisStep * b, otherCache,
+                               {dnai, "values" + stepName});
+    }
+
+    // While we are building sorted sequences of k' elements, the distance
+    // at which we change direction is the same as the merge distance,
+    // otherwise the direction should always be descending as we always
+    // discard the higher k' elements.
+    const auto changeDirDistance =
+        mergeStep < logK ? mergeDistance : 1u << (logN - 1);
+    compareAndSwapAtDistance(graph, prog, t, other, mergeDistance * b,
+                             changeDirDistance * b, mergeOrder, nThisStep * b,
+                             {dnai, stepName});
+
+    t = toCanonicalOrder(graph, t, mergeDistance * b, nThisStep * b);
+    if (other) {
+      other = toCanonicalOrder(graph, *other, mergeDistance * b, nThisStep * b);
+    }
+
+    // If we're done building sequences of k' elements, and we still have
+    // more than k' elements, discard the upper k' elements of each
+    // pair of k' sized sequences as by this point we have imposed an
+    // ordering between them.
+    if (logMergeDistance >= logK && n > kDash) {
+      const auto kDashMultiples = floordiv(n, 2 * kDash);
+      const auto remainder = n - kDashMultiples * 2 * kDash;
+      const auto t2d = t.reshape({n, b});
+      const auto evenPart =
+          t2d.slice(0, kDashMultiples * 2 * kDash)
+              .reshapePartial(0, 1, {kDashMultiples, 2 * kDash});
+      const auto oddPart = t2d.slice(kDashMultiples * 2 * kDash, n);
+      t = concat(evenPart.slice(0, kDash, 1).flatten(0, 2),
+                 oddPart.slice(0, std::min(remainder, kDash)))
+              .flatten();
+      if (other) {
+        const auto t2d = other->reshape({n, b});
+        const auto evenPart =
+            t2d.slice(0, kDashMultiples * 2 * kDash)
+                .reshapePartial(0, 1, {kDashMultiples, 2 * kDash});
+        const auto oddPart = t2d.slice(kDashMultiples * 2 * kDash, n);
+        other = concat(evenPart.slice(0, kDash, 1).flatten(0, 2),
+                       oddPart.slice(0, std::min(remainder, kDash)))
+                    .flatten();
+      }
+      n = nThisStep = kDashMultiples * kDash + std::min(remainder, kDash);
+    }
+
+    for (unsigned sortStep = 0; sortStep < logMergeDistance; ++sortStep) {
+      const auto stepName =
+          "Sort" + std::to_string(logMergeDistance - sortStep - 1);
+
+      const auto sortDistance = 1u << (logMergeDistance - sortStep - 1);
+      // Once we have built sorted sequences of k' elements, we keep
+      // merging and re-sorting so the distance at which we change
+      // directions once we start discarding is always k'/2.
+      const auto changeDirDistance =
+          std::min(mergeDistance, (1u << (logK - 1)));
+
+      const bool sortOrder = [&] {
+        bool order;
+        if (mergeStep < logK) {
+          // If we are still building sorted sequences of k' elements, the
+          // sort order is the same as the merge order
+          order = mergeOrder;
+        } else {
+          if (mergeStep == logN - 1) {
+            // If this is the last step then sort into the final desired
+            // order.
+            order = ascendingOrder;
+          } else {
+            // Otherwise sort into the order needed to correctly merge
+            // sequences of k' elements.
+            order = !mergeKSequencesOrder;
+          }
+        }
+        return order;
+      }();
+
+      t = rearrangeForStep(graph, prog, t, sortDistance * b, nThisStep * b,
+                           tCache, {dnai, "keys" + stepName});
+      if (other) {
+        other = rearrangeForStep(graph, prog, *other, sortDistance * b,
+                                 nThisStep * b, otherCache,
+                                 {dnai, "values" + stepName});
+      }
+
+      compareAndSwapAtDistance(graph, prog, t, other, sortDistance * b,
+                               changeDirDistance * b, sortOrder, nThisStep * b,
+                               {dnai, stepName});
+      t = toCanonicalOrder(graph, t, sortDistance * b, nThisStep * b);
+      if (other) {
+        other =
+            toCanonicalOrder(graph, *other, sortDistance * b, nThisStep * b);
+      }
+
+      // If we have finished building sequences of k' elements and
+      // there are more than k elements remaining, we can discard descending
+      // powers of 2 during the re-sort until we are left with exactly k
+      // elements.
+      const auto lastKDashElements = ((n - 1) % kDash) + 1;
+      // We either discard elements from the start or end of the last k'
+      // elements depending on whether we want the top or bottom k elements
+      // and depending on the sort order and offset into the array when it
+      // was sorted.
+      const bool fromStart =
+          sortOrder ^ !largest ^ bool(((n - lastKDashElements) / kDash) & 1u);
+      const auto nToDiscard =
+          fromStart ? sortDistance
+                    : ((lastKDashElements - 1) % sortDistance) + 1;
+      if (logMergeDistance + 1 >= logK && lastKDashElements >= nToDiscard &&
+          lastKDashElements - nToDiscard >= k) {
+        std::vector<Interval> slices;
+        if (fromStart) {
+          slices.emplace_back(0, n - lastKDashElements);
+          slices.emplace_back(n - lastKDashElements + nToDiscard, n);
+        } else {
+          slices.emplace_back(0, n - nToDiscard);
+        }
+        const auto t2d = t.reshape({n, b});
+        t = concat(t2d.slices(slices)).flatten();
+        if (other) {
+          const auto t2d = other->reshape({n, b});
+          other = concat(t2d.slices(slices)).flatten();
+        }
+        n -= nToDiscard;
+        nThisStep = n;
+      }
+    }
+  }
+
+  if (inputType != dataType) {
+    t = cast(graph, t, inputType, prog, dnai);
+  }
+
+  t = t.reshape({k, b}).transpose().reshape(outputShape);
+  if (other) {
+    other = other->reshape({k, b}).transpose().reshape(outputShape);
+  }
+  return std::make_pair(std::move(t), other.value_or(Tensor{}));
+}
+
+} // end namespace bitonic
+} // end namespace popops
diff --git a/lib/popops/BitonicTopK.hpp b/lib/popops/BitonicTopK.hpp
new file mode 100644
index 00000000..c20d9bac
--- /dev/null
+++ b/lib/popops/BitonicTopK.hpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
+
+#ifndef _popops_BitonicTopK_hpp_
+#define _popops_BitonicTopK_hpp_
+
+#include <optional>
+#include <vector>
+
+#include <poplar/DebugContext.hpp>
+#include <poplar/Graph.hpp>
+#include <poplar/Program.hpp>
+#include <poplar/Tensor.hpp>
+
+namespace popops {
+namespace bitonic {
+
+poplar::Tensor createTopKInputImpl(poplar::Graph &graph,
+                                   const poplar::Type &type,
+                                   const std::vector<std::size_t> &shape,
+                                   const poplar::DebugNameAndId &dnai = {});
+
+/// Implementation of topK using bitonic sort based method.
+/// Returns a pair of top k values in t, and matching permutation
+/// of \p other if it was given.
+std::pair<poplar::Tensor, poplar::Tensor>
+topKImpl(poplar::Graph &graph, poplar::program::Sequence &prog,
+         const poplar::Tensor &t, const std::optional<poplar::Tensor> &other,
+         const unsigned k, const bool largest, const bool sorted,
+         const bool ascendingOrder, const poplar::DebugNameAndId &dnai = {});
+
+} // end namespace bitonic
+} // end namespace popops
+
+#endif // _popops_BitonicTopK_hpp_
diff --git a/lib/popops/CMakeLists.txt b/lib/popops/CMakeLists.txt
index 9f6812ab..88c1e771 100644
--- a/lib/popops/CMakeLists.txt
+++ b/lib/popops/CMakeLists.txt
@@ -5,6 +5,8 @@ get_target_property(POPC_EXECUTABLE popc_bin LOCATION)
 
 add_library(popops SHARED
   AllTrue.cpp
+  BitonicTopK.cpp
+  BitonicTopK.hpp
   Cast.cpp
   CircBuf.cpp
   TensorCollectives.cpp
@@ -36,7 +38,9 @@ add_library(popops SHARED
   SelectScalarFromRows.cpp
   SequenceSlice.cpp
   Sort.cpp
+  SortOrder.cpp
   SparseUtils.cpp
+  TopK.cpp
   UpdateScalarInRows.cpp
   VarianceToOrFromInvStdDev.cpp
   Zero.cpp
@@ -61,6 +65,8 @@ add_library(popops SHARED
   ${CMAKE_SOURCE_DIR}/include/popops/Rearrange.hpp
   ${CMAKE_SOURCE_DIR}/include/popops/SequenceSlice.hpp
   ${CMAKE_SOURCE_DIR}/include/popops/Reduce.hpp
+  ${CMAKE_SOURCE_DIR}/include/popops/SortOrder.hpp
+  ${CMAKE_SOURCE_DIR}/include/popops/TopK.hpp
   ${CMAKE_SOURCE_DIR}/include/popops/Zero.hpp
   ${CMAKE_SOURCE_DIR}/include/poplibs_support/popopsPerformanceEstimation.hpp
 
@@ -175,6 +181,7 @@ add_gp_library(
     ${CMAKE_CURRENT_SOURCE_DIR}/codelets/BroadcastVectorInnerSupervisor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/codelets/CircBufIncrIndex.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/codelets/CircOffset.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/codelets/CompareAndSwapAtDistance.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/codelets/ContinuousReduce.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/codelets/DynamicSlice1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/codelets/DynamicSlice2d.cpp
diff --git a/lib/popops/ElementWise.cpp b/lib/popops/ElementWise.cpp
index 76ce8228..060fde02 100644
--- a/lib/popops/ElementWise.cpp
+++ b/lib/popops/ElementWise.cpp
@@ -420,6 +420,8 @@ unsigned maxVertexElementsPerRegion(const Target &target, const Type &outType,
       return 2;
     } else if (eType == FLOAT) {
       return 3;
+    } else if (eType == UNSIGNED_INT) {
+      return 4;
     } else {
       throw poplibs_error("Requested type to index conversion doesn't exist");
     }
@@ -428,22 +430,24 @@ unsigned maxVertexElementsPerRegion(const Target &target, const Type &outType,
   /* Assembler codelet implementations indicate how many elements are processed
    * per HW loop. If HW loop isn't in use or a codelet has only C implementation
    * them UINT_MAX shall be returned */
-  constexpr unsigned convMap[2][NR_OF_CODELETS][4] = {
+  constexpr unsigned convMap[2][NR_OF_CODELETS][5] = {
       {
           // None inPlace
-          {0, UINT_MAX, 2, 1},                      // Clamp
-          {0, UINT_MAX, 2, 1},                      // BroadcastClamp
-          {UINT_MAX, 1, UINT_MAX, 1},               // Select
-          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // BroadcastSelect
-          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}  // BroadcastSelectorSelect
+          {0, UINT_MAX, 2, 1, UINT_MAX},                      // Clamp
+          {0, UINT_MAX, 2, 1, UINT_MAX},                      // BroadcastClamp
+          {UINT_MAX, 1, UINT_MAX, 1, UINT_MAX},               // Select
+          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // BroadcastSelect
+          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX,
+           UINT_MAX} // BroadcastSelectorSelect
       },
       {
           // inPlace
-          {0, UINT_MAX, UINT_MAX, UINT_MAX},        // Clamp
-          {0, UINT_MAX, UINT_MAX, UINT_MAX},        // BroadcastClamp
-          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // Select
-          {0, 0, 0, 0},                             // BroadcastSelect
-          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}  // BroadcastSelectorSelect
+          {0, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX},        // Clamp
+          {0, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX},        // BroadcastClamp
+          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX}, // Select
+          {0, 0, 0, 0, 0},                                    // BroadcastSelect
+          {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX,
+           UINT_MAX} // BroadcastSelectorSelect
       }};
 
   unsigned inPlaceIdx = static_cast<unsigned>(inPlace);
diff --git a/lib/popops/SortOrder.cpp b/lib/popops/SortOrder.cpp
new file mode 100644
index 00000000..ffc4cd4d
--- /dev/null
+++ b/lib/popops/SortOrder.cpp
@@ -0,0 +1,27 @@
+// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
+
+#include <popops/SortOrder.hpp>
+#include <poputil/exceptions.hpp>
+
+#include <ostream>
+
+namespace popops {
+
+std::ostream &operator<<(std::ostream &os, const SortOrder &o) {
+  switch (o) {
+  case SortOrder::NONE:
+    os << "none";
+    break;
+  case SortOrder::ASCENDING:
+    os << "ascending";
+    break;
+  case SortOrder::DESCENDING:
+    os << "descending";
+    break;
+  default:
+    throw poputil::poplibs_error("Unhandled sort order");
+  }
+  return os;
+}
+
+} // end namespace popops
diff --git a/lib/popops/TopK.cpp b/lib/popops/TopK.cpp
new file mode 100644
index 00000000..f08e23c8
--- /dev/null
+++ b/lib/popops/TopK.cpp
@@ -0,0 +1,126 @@
+// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
+
+#include <popops/TopK.hpp>
+
+#include <popops/Encoding.hpp>
+
+#include <poputil/DebugInfo.hpp>
+#include <poputil/TileMapping.hpp>
+
+#include <poplibs_support/logging.hpp>
+#include <poplibs_support/print.hpp>
+
+#include "BitonicTopK.hpp"
+
+using namespace poplar;
+using namespace poplar::program;
+using namespace poplibs_support;
+using namespace poputil;
+
+namespace poputil {
+
+template <> ProfileValue toProfileValue(const popops::TopKParams &p) {
+  ProfileValue::Map v;
+  v.emplace("k", toProfileValue(p.k));
+  v.emplace("largest", toProfileValue(p.largest));
+  v.emplace("sortOrder", toProfileValue(p.sortOrder));
+  return v;
+}
+
+} // namespace poputil
+
+namespace popops {
+
+TopKParams::TopKParams(unsigned k, bool largest, SortOrder sortOrder) noexcept
+    : k(k), largest(largest), sortOrder(sortOrder) {}
+
+std::ostream &operator<<(std::ostream &os, const TopKParams &p) {
+  os << "{"
+     << "k=" << p.k << ", largest=" << (p.largest ? "true" : "false")
+     << ", sortOrder=" << p.sortOrder << "}";
+  return os;
+}
+
+Tensor createTopKInput(Graph &graph, const Type &type,
+                       const std::vector<std::size_t> &shape,
+                       const TopKParams &params,
+                       const DebugContext &debugContext) {
+  logging::popops::info("createTopKInput(shape={}, params={}, debugPath='{}')",
+                        shape, params, debugContext.getPathName());
+  poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(shape, params),
+                                 "createTopKInput");
+  const auto t = bitonic::createTopKInputImpl(graph, type, shape, {di});
+  di.addOutput(t);
+  return t;
+}
+
+Tensor topK(Graph &graph, Sequence &prog, const Tensor &t,
+            const TopKParams &params, const DebugContext &debugContext) {
+  logging::popops::info("topK(shape={}, params={}, debugPath='{}')", t.shape(),
+                        params, debugContext.getPathName());
+  poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(t, params), "topK");
+  const bool sorted = params.sortOrder != SortOrder::NONE;
+  const bool ascending = params.sortOrder != SortOrder::DESCENDING;
+  const auto result = bitonic::topKImpl(graph, prog, t, std::nullopt, params.k,
+                                        params.largest, sorted, ascending, {di})
+                          .first;
+  di.addOutput(result);
+  return result;
+}
+
+std::pair<Tensor, Tensor> topKKeyValue(Graph &graph, Sequence &prog,
+                                       const Tensor &key, const Tensor &value,
+                                       const TopKParams &params,
+                                       const DebugContext &debugContext) {
+  logging::popops::info("topKKeyValue(shape={}, params={}, debugPath='{}')",
+                        key.shape(), params, debugContext.getPathName());
+  poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(key, value, params),
+                                 "topKKeyValue");
+  const bool sorted = params.sortOrder != SortOrder::NONE;
+  const bool ascending = params.sortOrder != SortOrder::DESCENDING;
+  const auto result =
+      bitonic::topKImpl(graph, prog, key, value, params.k, params.largest,
+                        sorted, ascending, {di});
+  di.addOutput(result.first);
+  di.addOutput(result.second);
+  return result;
+}
+
+std::pair<Tensor, Tensor>
+topKWithPermutation(Graph &graph, Sequence &prog, const Tensor &t,
+                    const TopKParams &params,
+                    const DebugContext &debugContext) {
+  logging::popops::info(
+      "topKWithPermutation(shape={}, params={}, debugPath='{}')", t.shape(),
+      params, debugContext.getPathName());
+  poputil::PoplibsOpDebugInfo di(debugContext, DI_ARGS(t, params));
+  const bool sorted = params.sortOrder != SortOrder::NONE;
+  const bool ascending = params.sortOrder != SortOrder::DESCENDING;
+
+  const auto numElems = t.numElements();
+  const auto n = t.dim(t.rank() - 1);
+  const auto b = numElems / t.dim(t.rank() - 1);
+  const auto indicesToPermute =
+      createTopKInput(graph, UNSIGNED_INT, t.shape(), params, {di, "indices"});
+  if (b == 1) {
+    iota(graph, indicesToPermute.flatten(), 0u, prog, {di});
+  } else {
+    // T34944, a 2-dimensional iota API with a second dimension in which to
+    // broadcast would be useful here.
+    std::vector<std::size_t> singleBatchIndices(n);
+    std::iota(singleBatchIndices.begin(), singleBatchIndices.end(), 0);
+    const auto iota =
+        graph.addConstant(UNSIGNED_INT, {1, n}, ArrayRef(singleBatchIndices),
+                          {di, "indicesInitializer"});
+    poputil::mapTensorLinearly(graph, iota);
+    prog.add(Copy(iota.broadcast(b, 0).flatten(), indicesToPermute.flatten()));
+  }
+  const auto result =
+      bitonic::topKImpl(graph, prog, t, indicesToPermute, params.k,
+                        params.largest, sorted, ascending, {di});
+  di.addOutput(result.first);
+  di.addOutput(result.second);
+  return result;
+}
+
+} // end namespace popops
diff --git a/lib/popops/codelets/CompareAndSwapAtDistance.cpp b/lib/popops/codelets/CompareAndSwapAtDistance.cpp
new file mode 100644
index 00000000..808c2ef3
--- /dev/null
+++ b/lib/popops/codelets/CompareAndSwapAtDistance.cpp
@@ -0,0 +1,184 @@
+// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
+#include <poplar/Alignment.hpp>
+#include <poplar/HalfFloat.hpp>
+#include <poplar/Vertex.hpp>
+
+#include "poplibs_support/ExternalCodelet.hpp"
+#include "poplibs_support/TileConstants.hpp"
+
+using namespace poplar;
+
+static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR;
+static constexpr auto DELTAN = poplar::VectorListLayout::DELTANELEMENTS;
+
+namespace popops {
+
+namespace {
+
+using WorklistType = unsigned short;
+
+} // end anonymous namespace
+
+template <typename T> constexpr static inline T min(const T &a, const T &b) {
+  return a < b ? a : b;
+}
+
+// The format of the worklists for each worker is as follows in order as they
+// appear in memory.
+//
+// [numEntriesM1]         A count for the number of entries that will follow,
+//                        minus 1.
+//                        the always-present info in the worklist.
+// [initialOffset]        An offset in groups into the indices/values that
+//                        this worker will start at.
+// [initialOrder/initialCount] 2 entries to form a 32-bit packed field.
+//   [0-1: initialDirection] The initial direction for this worker
+//   [1-32:  initialCount]   The initial count for the counter that tracks
+//                        changes of direction.
+// [firstInnerElemCount]  Because the first entry for a worker may start
+//                        at a weird offset, the element count for the first
+//                        inner loop is given in the worklist.
+// numEntries * {
+//   [distance]             The distance at which to do compare and swap.
+//   [elemCount]            The total number of elements to compare and swap
+//                          for this entry.
+// }
+template <typename Impl>
+static void workerCompute(unsigned wid, Impl impl, const WorklistType *worklist,
+                          unsigned distanceToChangeOrder) {
+  const auto numEntriesM1 = *worklist++;
+  auto numEntries = numEntriesM1 + 1;
+  const auto initialOffset = *worklist++;
+  impl.increment(initialOffset);
+  auto *worklistUnsigned = reinterpret_cast<const unsigned *>(worklist);
+  const auto packedOrderAndCount = *worklistUnsigned++;
+  worklist = reinterpret_cast<const WorklistType *>(worklistUnsigned);
+  bool order = packedOrderAndCount & 1u;
+  const auto initialCount = packedOrderAndCount >> 1u;
+  auto changeOrderCounter = distanceToChangeOrder - initialCount;
+
+  unsigned innerElemCount = *worklist++;
+  // numEntries should never be 0
+  unsigned distance = *worklist++;
+  unsigned numElems = *worklist++;
+
+  while (numEntries-- > 0) {
+    while (numElems != 0) {
+      numElems -= innerElemCount;
+      changeOrderCounter -= innerElemCount;
+
+      // This will be a rpt loop.
+      while (innerElemCount-- > 0) {
+        impl.compareAndSwap(distance, order);
+        impl.increment(1);
+      }
+      impl.increment(distance);
+      innerElemCount = min(distance, numElems);
+      if (changeOrderCounter == 0) {
+        order = !order;
+        changeOrderCounter = distanceToChangeOrder;
+      }
+    }
+    // We will overread here but only by 2 worklist elements i.e.
+    // 4 bytes.
+    distance = *worklist++;
+    numElems = *worklist++;
+    innerElemCount = min(distance, numElems);
+  }
+}
+
+template <typename KeyType> constexpr inline bool hasAssemblyVersion() {
+  return false;
+}
+
+template <typename KeyType> struct KeyImpl {
+  KeyType *keys;
+  void compareAndSwap(unsigned distance, bool order) {
+    if (order == (keys[0] > keys[distance])) {
+      std::swap(keys[0], keys[distance]);
+    }
+  }
+  void increment(unsigned n) { keys += n; }
+};
+
+template <typename KeyType>
+class CompareAndSwapAtDistance
+    : public SupervisorVertexIf<hasAssemblyVersion<KeyType>() &&
+                                ASM_CODELETS_ENABLED> {
+public:
+  InOut<Vector<KeyType, ONE_PTR>> keys;
+
+  // Outer dimension of array indexed by worker context id
+  Input<VectorList<WorklistType, DELTAN, 4>> worklists;
+
+  // Used with the logical offset to determine the direction of comparison for
+  // a given comparison.
+  unsigned distanceToChangeOrder;
+
+  IS_EXTERNAL_CODELET((hasAssemblyVersion<KeyType>()));
+
+  bool compute() {
+    const auto numWorkers = worklists.size();
+    KeyImpl<KeyType> impl = {&keys[0]};
+    for (unsigned wid = 0; wid < numWorkers; ++wid) {
+      const WorklistType *worklist = &worklists[wid][0];
+      workerCompute(wid, impl, worklist, distanceToChangeOrder);
+    }
+    return true;
+  }
+};
+
+template class CompareAndSwapAtDistance<float>;
+
+template <typename KeyType, typename ValueType>
+constexpr inline bool hasAssemblyVersionKeyVal() {
+  return false;
+}
+
+template <typename KeyType, typename ValueType> struct KeyValImpl {
+  KeyType *keys;
+  ValueType *values;
+  void compareAndSwap(unsigned distance, bool order) {
+    if (order == (keys[0] > keys[distance])) {
+      std::swap(keys[0], keys[distance]);
+      std::swap(values[0], values[distance]);
+    }
+  }
+  void increment(unsigned n) {
+    keys += n;
+    values += n;
+  }
+};
+
+template <typename KeyType, typename ValueType>
+class CompareAndSwapAtDistanceKeyVal
+    : public SupervisorVertexIf<
+          hasAssemblyVersionKeyVal<KeyType, ValueType>() &&
+          ASM_CODELETS_ENABLED> {
+public:
+  InOut<Vector<KeyType, ONE_PTR>> keys;
+  InOut<Vector<ValueType, ONE_PTR>> values;
+
+  // Outer dimension of array indexed by worker context id
+  Input<VectorList<WorklistType, DELTAN, 4>> worklists;
+
+  // Used with the logical offset to determine the direction of comparison for
+  // a given comparison.
+  unsigned distanceToChangeOrder;
+
+  IS_EXTERNAL_CODELET((hasAssemblyVersionKeyVal<KeyType, ValueType>()));
+
+  bool compute() {
+    const auto numWorkers = worklists.size();
+    KeyValImpl<KeyType, ValueType> impl = {&keys[0], &values[0]};
+    for (unsigned wid = 0; wid < numWorkers; ++wid) {
+      const WorklistType *worklist = &worklists[wid][0];
+      workerCompute(wid, impl, worklist, distanceToChangeOrder);
+    }
+    return true;
+  }
+};
+
+template class CompareAndSwapAtDistanceKeyVal<float, unsigned>;
+
+} // end namespace popops
diff --git a/lib/popops/codelets/HeapSortVertexKV.cpp b/lib/popops/codelets/HeapSortVertexKV.cpp
index d58bf8c4..327ec9ca 100644
--- a/lib/popops/codelets/HeapSortVertexKV.cpp
+++ b/lib/popops/codelets/HeapSortVertexKV.cpp
@@ -126,12 +126,17 @@ class HeapSortVertexKV : public poplar::Vertex {
 
 template class HeapSortVertexKV<float, float>;
 template class HeapSortVertexKV<float, int>;
+template class HeapSortVertexKV<float, unsigned>;
 template class HeapSortVertexKV<float, half>;
 template class HeapSortVertexKV<int, float>;
 template class HeapSortVertexKV<int, int>;
 template class HeapSortVertexKV<int, half>;
+template class HeapSortVertexKV<unsigned, float>;
+template class HeapSortVertexKV<unsigned, int>;
+template class HeapSortVertexKV<unsigned, half>;
 template class HeapSortVertexKV<half, float>;
 template class HeapSortVertexKV<half, int>;
+template class HeapSortVertexKV<half, unsigned>;
 template class HeapSortVertexKV<half, half>;
 
 } // namespace popops
diff --git a/lib/popops/codelets/asm/BroadcastSelect.S b/lib/popops/codelets/asm/BroadcastSelect.S
index 0a28ae97..1965bee0 100644
--- a/lib/popops/codelets/asm/BroadcastSelect.S
+++ b/lib/popops/codelets/asm/BroadcastSelect.S
@@ -14,17 +14,20 @@
 
 #define BCAST_SELECT_FLOAT __runCodelet_popops__BroadcastSelect___float
 #define BCAST_SELECT_INT __runCodelet_popops__BroadcastSelect___int
+#define BCAST_SELECT_UNSIGNED_INT __runCodelet_popops__BroadcastSelect___unsigned_int
 #define BCAST_SELECT_HALF __runCodelet_popops__BroadcastSelect___half
 #define BCAST_SELECT_BOOL __runCodelet_popops__BroadcastSelect___bool
 
 #define BCAST_SELECTOR_SELECT_HALF __runCodelet_popops__BroadcastSelectorSelect___half
 #define BCAST_SELECTOR_SELECT_FLOAT __runCodelet_popops__BroadcastSelectorSelect___float
 #define BCAST_SELECTOR_SELECT_INT __runCodelet_popops__BroadcastSelectorSelect___int
+#define BCAST_SELECTOR_SELECT_UNSIGNED_INT __runCodelet_popops__BroadcastSelectorSelect___unsigned_int
 #define BCAST_SELECTOR_SELECT_BOOL __runCodelet_popops__BroadcastSelectorSelect___bool
 
 #define BCAST_SELECTOR_SELECT_INPLACE_HALF __runCodelet_popops__BroadcastSelectorSelectInPlace___half
 #define BCAST_SELECTOR_SELECT_INPLACE_FLOAT __runCodelet_popops__BroadcastSelectorSelectInPlace___float
 #define BCAST_SELECTOR_SELECT_INPLACE_INT __runCodelet_popops__BroadcastSelectorSelectInPlace___int
+#define BCAST_SELECTOR_SELECT_INPLACE_UNSIGNED_INT __runCodelet_popops__BroadcastSelectorSelectInPlace___unsigned_int
 #define BCAST_SELECTOR_SELECT_INPLACE_BOOL __runCodelet_popops__BroadcastSelectorSelectInPlace___bool
 
 
@@ -34,6 +37,7 @@
 .endm
 
 EXPORT_FN BCAST_SELECT_INT
+EXPORT_FN BCAST_SELECT_UNSIGNED_INT
 EXPORT_FN BCAST_SELECT_FLOAT
 EXPORT_FN BCAST_SELECT_HALF
 EXPORT_FN BCAST_SELECT_BOOL
@@ -41,11 +45,13 @@ EXPORT_FN BCAST_SELECT_BOOL
 EXPORT_FN BCAST_SELECTOR_SELECT_HALF
 EXPORT_FN BCAST_SELECTOR_SELECT_FLOAT
 EXPORT_FN BCAST_SELECTOR_SELECT_INT
+EXPORT_FN BCAST_SELECTOR_SELECT_UNSIGNED_INT
 EXPORT_FN BCAST_SELECTOR_SELECT_BOOL
 
 EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_HALF
 EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_FLOAT
 EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_INT
+EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_UNSIGNED_INT
 EXPORT_FN BCAST_SELECTOR_SELECT_INPLACE_BOOL
 
 
@@ -87,6 +93,7 @@ DEF_STACK_USAGE 0 .text.BCAST_SELECT_FLOAT
 .align 8
 
 BCAST_SELECT_INT:
+BCAST_SELECT_UNSIGNED_INT:
 BCAST_SELECT_FLOAT:
   // Load the vertex state. Bail out straight away if no rows to do
   ld32 $nRows, $mvertex_base, $mzero, VERTEX_OUT_ROWS_OFFSET
@@ -544,6 +551,7 @@ DEF_STACK_SIZE_OWN 0 .text.BCAST_SELECTOR_SELECT_FLOAT
 // ---- InPlace entry points
 BCAST_SELECTOR_SELECT_INPLACE_FLOAT:
 BCAST_SELECTOR_SELECT_INPLACE_INT:
+BCAST_SELECTOR_SELECT_INPLACE_UNSIGNED_INT:
   setzi  $BYTES_PER_WORD, 4
   bri    1f
 BCAST_SELECTOR_SELECT_INPLACE_BOOL:
@@ -565,6 +573,7 @@ BCAST_SELECTOR_SELECT_INPLACE_HALF:
 // --- Non-inplace entry points.
 BCAST_SELECTOR_SELECT_FLOAT:
 BCAST_SELECTOR_SELECT_INT:
+BCAST_SELECTOR_SELECT_UNSIGNED_INT:
   setzi  $BYTES_PER_WORD, 4
   bri    2f
 BCAST_SELECTOR_SELECT_BOOL:
diff --git a/lib/popops/codelets/asm/select_int_float.S b/lib/popops/codelets/asm/select_int_float.S
index c33384d8..485c28ee 100644
--- a/lib/popops/codelets/asm/select_int_float.S
+++ b/lib/popops/codelets/asm/select_int_float.S
@@ -4,11 +4,15 @@
 #include "poplar/StackSizeDefs.hpp"
 
 #define SELECT_INT __runCodelet_popops__Select___int
+#define SELECT_UNSIGNED_INT __runCodelet_popops__Select___unsigned_int
 #define SELECT_FLOAT __runCodelet_popops__Select___float
 
 .globl SELECT_INT
 .type SELECT_INT, @function
 
+.globl SELECT_UNSIGNED_INT
+.type SELECT_UNSIGNED_INT, @function
+
 .globl SELECT_FLOAT
 .type SELECT_FLOAT, @function
 
@@ -40,6 +44,7 @@ DEF_STACK_USAGE 0 .text.SELECT_INT
 
 nop // Required for alignment of rpt
 SELECT_INT:
+SELECT_UNSIGNED_INT:
 SELECT_FLOAT:
   // Load the vertex state.
   ld32 $in1, $mvertex_base, $mzero, VERTEX_IN1_OFFSET
diff --git a/lib/popops/codelets/elemwiseMiscCodelets.cpp b/lib/popops/codelets/elemwiseMiscCodelets.cpp
index cc33a081..2cedeb6a 100644
--- a/lib/popops/codelets/elemwiseMiscCodelets.cpp
+++ b/lib/popops/codelets/elemwiseMiscCodelets.cpp
@@ -459,6 +459,7 @@ template <typename InType> class Select : public Vertex {
 template class Select<float>;
 template class Select<half>;
 template class Select<int>;
+template class Select<unsigned>;
 template class Select<bool>;
 
 template <typename InType> class BroadcastClamp : public Vertex {
@@ -517,6 +518,7 @@ template <typename InType> class BroadcastSelect : public Vertex {
 template class BroadcastSelect<float>;
 template class BroadcastSelect<half>;
 template class BroadcastSelect<int>;
+template class BroadcastSelect<unsigned>;
 template class BroadcastSelect<bool>;
 
 // 'Select' ternary operator where the selector (boolean third operand) is a
@@ -544,6 +546,7 @@ template <typename InType> class BroadcastSelectorSelect : public Vertex {
 template class BroadcastSelectorSelect<float>;
 template class BroadcastSelectorSelect<half>;
 template class BroadcastSelectorSelect<int>;
+template class BroadcastSelectorSelect<unsigned>;
 template class BroadcastSelectorSelect<bool>;
 
 template <typename InType> class ClampInPlace : public Vertex {
@@ -615,6 +618,7 @@ template <typename InType> class SelectInPlace : public Vertex {
 template class SelectInPlace<float>;
 template class SelectInPlace<half>;
 template class SelectInPlace<int>;
+template class SelectInPlace<unsigned>;
 template class SelectInPlace<bool>;
 
 template <typename InType>
@@ -640,6 +644,7 @@ class BroadcastSelectorSelectInPlace : public Vertex {
 template class BroadcastSelectorSelectInPlace<float>;
 template class BroadcastSelectorSelectInPlace<half>;
 template class BroadcastSelectorSelectInPlace<int>;
+template class BroadcastSelectorSelectInPlace<unsigned>;
 template class BroadcastSelectorSelectInPlace<bool>;
 
 template <typename InType, bool isAbsolute> class Histogram2D : public Vertex {
diff --git a/lib/popops/popopsCycleEstimators.cpp b/lib/popops/popopsCycleEstimators.cpp
index 4e886ec1..2a526b0a 100644
--- a/lib/popops/popopsCycleEstimators.cpp
+++ b/lib/popops/popopsCycleEstimators.cpp
@@ -2821,6 +2821,20 @@ VertexPerfEstimate MAKE_PERF_ESTIMATOR_NAME(TransposeSupervisor)(
   return 7 + 6 * maxCycles;
 }
 
+VertexPerfEstimate MAKE_PERF_ESTIMATOR_NAME(CompareAndSwapAtDistance)(
+    const VertexIntrospector &vertex, const Target &target,
+    const Type &keyType) {
+  // TODO:
+  return 0;
+}
+
+VertexPerfEstimate MAKE_PERF_ESTIMATOR_NAME(CompareAndSwapAtDistanceKeyVal)(
+    const VertexIntrospector &vertex, const Target &target, const Type &keyType,
+    const Type &valueType) {
+  // TODO:
+  return 0;
+}
+
 #define BROADCAST_2TYPE_CYCLE_ESTIM_ENTRIES(vertexName)                        \
   CYCLE_ESTIMATOR_ENTRY(popops, vertexName,                                    \
                         BinaryOpType::VARIANCE_TO_INV_STD_DEV, FLOAT, HALF),   \
@@ -3106,26 +3120,32 @@ poplibs::PerfEstimatorTable makePerfFunctionTable() {
       CYCLE_ESTIMATOR_ENTRY(popops, Select, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, Select, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, Select, INT),
+      CYCLE_ESTIMATOR_ENTRY(popops, Select, UNSIGNED_INT),
       CYCLE_ESTIMATOR_ENTRY(popops, Select, BOOL),
 
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, INT),
+      CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, UNSIGNED_INT),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelect, BOOL),
 
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, INT),
+      CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, UNSIGNED_INT),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelect, BOOL),
 
       CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, INT),
+      CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, UNSIGNED_INT),
       CYCLE_ESTIMATOR_ENTRY(popops, SelectInPlace, BOOL),
 
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, INT),
+      CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace,
+                            UNSIGNED_INT),
       CYCLE_ESTIMATOR_ENTRY(popops, BroadcastSelectorSelectInPlace, BOOL),
 
       CYCLE_ESTIMATOR_ENTRY(popops, Histogram2D, FLOAT, true),
@@ -3193,9 +3213,11 @@ poplibs::PerfEstimatorTable makePerfFunctionTable() {
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, INT, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, INT, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, INT),
+      CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, UNSIGNED_INT),
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, FLOAT, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, INT),
+      CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, UNSIGNED_INT),
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, FLOAT),
       CYCLE_ESTIMATOR_ENTRY(popops, HeapSortVertexKV, HALF, HALF),
 
@@ -3236,7 +3258,10 @@ poplibs::PerfEstimatorTable makePerfFunctionTable() {
       CYCLE_ESTIMATOR_ENTRY(popops, TransposeSupervisor, HALF),
       CYCLE_ESTIMATOR_ENTRY(popops, TransposeSupervisor, UNSIGNED_SHORT),
       CYCLE_ESTIMATOR_ENTRY(popops, TransposeSupervisor, SHORT),
-  };
+
+      CYCLE_ESTIMATOR_ENTRY(popops, CompareAndSwapAtDistance, FLOAT),
+      CYCLE_ESTIMATOR_ENTRY(popops, CompareAndSwapAtDistanceKeyVal, FLOAT,
+                            UNSIGNED_INT)};
 
   for (const auto &entry : unaryOpPerfInfo) {
     table.push_back(CYCLE_ESTIMATOR_ENTRY(popops, UnaryOp2D, entry.first.first,
diff --git a/tests/popops/CMakeLists.txt b/tests/popops/CMakeLists.txt
index 3b74ffb3..5f7c02ba 100644
--- a/tests/popops/CMakeLists.txt
+++ b/tests/popops/CMakeLists.txt
@@ -720,9 +720,14 @@ add_std_operators_test(SelectFloatRHSConst)
 add_std_operators_test(SelectFloatLHSAndRHSConst)
 add_std_operators_test(SelectHalfLHSAndRHSConst)
 add_std_operators_test(SelectInt)
+add_std_operators_test(SelectUInt)
+add_std_operators_test(SelectInPlaceInt)
+add_std_operators_test(SelectInPlaceUInt)
 add_std_operators_test(BroadcastSelectorSelectInt)
+add_std_operators_test(BroadcastSelectorSelectUInt)
 add_std_operators_test(BroadcastSelectorSelectFloat)
 add_std_operators_test(BroadcastSelectorSelectInPlaceInt)
+add_std_operators_test(BroadcastSelectorSelectInPlaceUInt)
 add_std_operators_test(BroadcastSelectorSelectInPlaceFloat)
 add_std_operators_test(ClampFloat)
 add_std_operators_test(ClampFloatMinConst)
@@ -1098,90 +1103,227 @@ foreach(dType half float)
   endforeach()
 endforeach()
 
-add_multitarget_test(NAME topk_f32_u32_n1000_k1000_sorted
-  COMMAND topk
-    --n 1000
-    --k 1000
-    --data-type float
-    --index-type uint
-    --sort-output=1
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f32_u32_n1000_k100_sorted
-  COMMAND topk
-    --n 1000
-    --k 100
-    --data-type float
-    --index-type uint
-    --sort-output=1
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f32_u32_n1000_k1
-  COMMAND topk
-    --n 1000
-    --k 100
-    --data-type float
-    --index-type uint
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f32_u32_n1000_k1000_unsorted
-  COMMAND topk
-    --n 1000
-    --k 1000
-    --data-type float
-    --index-type uint
-    --sort-output=0
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f32_u32_n1000_k100_unsorted
-  COMMAND topk
-    --n 1000
-    --k 100
-    --data-type float
-    --index-type uint
-    --sort-output=0
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f16_u32_n1000_k1000_sorted
-  COMMAND topk
-    --n 1000
-    --k 1000
-    --data-type half
-    --index-type uint
-    --sort-output=1
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f16_u32_n1000_k100_sorted
-  COMMAND topk
-    --n 1000
-    --k 100
-    --data-type half
-    --index-type uint
-    --sort-output=1
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f16_u32_n1000_k1
-  COMMAND topk
-    --n 1000
-    --k 100
-    --data-type half
-    --index-type uint
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f16_u32_n1000_k1000_unsorted
-  COMMAND topk
-    --n 1000
-    --k 1000
-    --data-type half
-    --index-type uint
-    --sort-output=0
-    --tiles-per-ipu=16)
-
-add_multitarget_test(NAME topk_f16_u32_n1000_k100_unsorted
-  COMMAND topk
-    --n 1000
-    --k 100
-    --data-type half
-    --index-type uint
-    --sort-output=0
-    --tiles-per-ipu=16)
+
+foreach(DATA_TYPE half float)
+  add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_b0_n1000_k100_edge_case
+    COMMAND topk
+      --n 1000
+      --k 100
+      --b 0
+      --data-type ${DATA_TYPE}
+      --index-type uint
+      --api=popops
+      --tiles-per-ipu=4
+    VARIANTS ${IPUMODEL_VARIANTS})
+  foreach(BATCH_SIZE 1 3)
+    add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k1000_sorted
+      COMMAND topk
+        --n 1000
+        --k 1000
+        --batch-size ${BATCH_SIZE}
+        --data-type ${DATA_TYPE}
+        --index-type uint
+        --sort-order=descending
+        --api=popnn
+        --tiles-per-ipu=4)
+
+    add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k100_sorted
+      COMMAND topk
+        --n 1000
+        --k 100
+        --batch-size ${BATCH_SIZE}
+        --data-type ${DATA_TYPE}
+        --index-type uint
+        --sort-order=descending
+        --api=popnn
+        --tiles-per-ipu=4)
+
+    add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k1
+      COMMAND topk
+        --n 1000
+        --k 100
+        --batch-size ${BATCH_SIZE}
+        --data-type ${DATA_TYPE}
+        --index-type uint
+        --api=popnn
+        --tiles-per-ipu=4
+      VARIANTS ${IPUMODEL_VARIANTS})
+
+    add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k1000_unsorted
+      COMMAND topk
+        --n 1000
+        --k 1000
+        --batch-size ${BATCH_SIZE}
+        --data-type ${DATA_TYPE}
+        --index-type uint
+        --sort-order=none
+        --api=popnn
+        --tiles-per-ipu=4
+      VARIANTS ${IPUMODEL_VARIANTS})
+
+    add_multitarget_test(NAME topk_popnn_${DATA_TYPE}_unsigned_b${BATCH_SIZE}_n1000_k100_unsorted
+      COMMAND topk
+        --n 1000
+        --k 100
+        --batch-size ${BATCH_SIZE}
+        --data-type ${DATA_TYPE}
+        --index-type uint
+        --sort-order=none
+        --api=popnn
+        --tiles-per-ipu=4
+      VARIANTS ${IPUMODEL_VARIANTS})
+  endforeach()
+
+  foreach(INDICES 0 1)
+    foreach(BATCH_SIZE 1 3)
+      foreach(SORT_ORDER ascending descending)
+
+        add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1_k1_${SORT_ORDER}_edge_case
+          COMMAND topk
+            --n 1
+            --k 1
+            --batch-size ${BATCH_SIZE}
+            --data-type ${DATA_TYPE}
+            --index-type uint
+            --return-indices ${INDICES}
+            --api=popops
+            --sort-order=${SORT_ORDER}
+            --tiles-per-ipu=4
+          VARIANTS ${IPUMODEL_VARIANTS})
+
+        add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k1000_${SORT_ORDER}
+          COMMAND topk
+            --n 1000
+            --k 1000
+            --batch-size ${BATCH_SIZE}
+            --data-type ${DATA_TYPE}
+            --index-type uint
+            --return-indices ${INDICES}
+            --sort-order=${SORT_ORDER}
+            --api=popops
+            --tiles-per-ipu=4
+          VARIANTS ${IPUMODEL_VARIANTS})
+
+        add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k1234_${SORT_ORDER}
+          COMMAND topk
+            --n 1234
+            --batch-size ${BATCH_SIZE}
+            --data-type ${DATA_TYPE}
+            --index-type uint
+            --return-indices ${INDICES}
+            --api=popops
+            --sort-order=${SORT_ORDER}
+            --tiles-per-ipu=4)
+
+        foreach(LARGEST 0 1)
+          add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k100_largest${LARGEST}_${SORT_ORDER}
+            COMMAND topk
+              --n 1000
+              --k 100
+              --batch-size ${BATCH_SIZE}
+              --data-type ${DATA_TYPE}
+              --index-type uint
+              --return-indices ${INDICES}
+              --largest ${LARGEST}
+              --sort-order=${SORT_ORDER}
+              --api=popops
+              --tiles-per-ipu=4
+            VARIANTS ${IPUMODEL_VARIANTS})
+
+          add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k123_largest${LARGEST}_${SORT_ORDER}
+            COMMAND topk
+              --n 1234
+              --k 123
+              --batch-size ${BATCH_SIZE}
+              --data-type ${DATA_TYPE}
+              --index-type uint
+              --return-indices ${INDICES}
+              --api=popops
+              --largest ${LARGEST}
+              --sort-order=${SORT_ORDER}
+              --tiles-per-ipu=4)
+        endforeach()
+      endforeach()
+
+      add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k0_edge_case
+        COMMAND topk
+          --n 1000
+          --k 0
+          --batch-size ${BATCH_SIZE}
+          --data-type ${DATA_TYPE}
+          --index-type uint
+          --return-indices ${INDICES}
+          --api=popops
+          --tiles-per-ipu=4
+        VARIANTS ${IPUMODEL_VARIANTS})
+
+      add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k1000_unsorted
+        COMMAND topk
+          --n 1000
+          --k 1000
+          --batch-size ${BATCH_SIZE}
+          --data-type ${DATA_TYPE}
+          --index-type uint
+          --return-indices ${INDICES}
+          --sort-order=none
+          --api=popops
+          --tiles-per-ipu=4
+        VARIANTS ${IPUMODEL_VARIANTS})
+
+      add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k1234_unsorted
+        COMMAND topk
+          --n 1234
+          --batch-size ${BATCH_SIZE}
+          --data-type ${DATA_TYPE}
+          --index-type uint
+          --return-indices ${INDICES}
+          --api=popops
+          --sort-order=none
+          --tiles-per-ipu=4
+        VARIANTS ${IPUMODEL_VARIANTS})
+
+      foreach(LARGEST 0 1)
+        add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k100_largest${LARGEST}_unsorted
+          COMMAND topk
+            --n 1000
+            --k 100
+            --batch-size ${BATCH_SIZE}
+            --data-type ${DATA_TYPE}
+            --index-type uint
+            --return-indices ${INDICES}
+            --largest ${LARGEST}
+            --sort-order=none
+            --api=popops
+            --tiles-per-ipu=4
+          VARIANTS ${IPUMODEL_VARIANTS})
+
+        add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1000_k1_largest${LARGEST}
+          COMMAND topk
+            --n 1000
+            --k 100
+            --batch-size ${BATCH_SIZE}
+            --data-type ${DATA_TYPE}
+            --index-type uint
+            --return-indices ${INDICES}
+            --largest ${LARGEST}
+            --api=popops
+            --tiles-per-ipu=4
+          VARIANTS ${IPUMODEL_VARIANTS})
+
+        add_multitarget_test(NAME topk_popops_${DATA_TYPE}_unsigned_indices${INDICES}_b${BATCH_SIZE}_n1234_k123_largest${LARGEST}_unsorted
+          COMMAND topk
+            --n 1234
+            --k 123
+            --batch-size ${BATCH_SIZE}
+            --data-type ${DATA_TYPE}
+            --index-type uint
+            --return-indices ${INDICES}
+            --api=popops
+            --largest ${LARGEST}
+            --sort-order=none
+            --tiles-per-ipu=4
+          VARIANTS ${IPUMODEL_VARIANTS})
+      endforeach()
+    endforeach()
+  endforeach()
+endforeach()
diff --git a/tests/popops/SortTest.cpp b/tests/popops/SortTest.cpp
index f7bc5b5b..0e3f965f 100644
--- a/tests/popops/SortTest.cpp
+++ b/tests/popops/SortTest.cpp
@@ -5,6 +5,7 @@
 
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/random/uniform_int_distribution.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
 #include <boost/test/unit_test.hpp>
 
 #include <poplar/Engine.hpp>
@@ -403,3 +404,104 @@ BOOST_AUTO_TEST_CASE(DeviceSortKVInt) {
     BOOST_CHECK(std::is_sorted(begin, end));
   }
 }
+
+BOOST_AUTO_TEST_CASE(DeviceSortKVFloatUInt) {
+  std::array<float, 64> key;
+  std::array<unsigned, 64> value;
+  boost::random::mt19937 gen;
+  boost::random::uniform_real_distribution<> dist(-1024, 1024);
+  std::generate(std::begin(key), std::end(key), std::bind(dist, gen));
+  std::iota(value.begin(), value.end(), 0);
+  auto out = deviceSortKV(key, value);
+
+  // Check that we have the same elements in some order
+  BOOST_CHECK(
+      std::is_permutation(std::begin(value), std::end(value), std::begin(out)));
+
+  // Check that the elements are in sorted order
+  std::array<float, 64> keyPermuted;
+  for (std::size_t i = 0; i < out.size(); ++i) {
+    keyPermuted[i] = key[out[i]];
+  }
+  BOOST_CHECK(std::is_sorted(std::begin(keyPermuted), std::begin(keyPermuted)));
+
+  out = deviceSortKV(key, value, {4, 4, 4}, 2);
+  BOOST_CHECK(
+      std::is_permutation(std::begin(value), std::end(value), std::begin(out)));
+
+  // Check that the elements are in sorted order on the specified dimension
+  for (std::size_t i = 0; i < out.size(); ++i) {
+    keyPermuted[i] = key[out[i]];
+  }
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      const auto begin = keyPermuted.data() + (i * 16 + j * 4);
+      const auto end = keyPermuted.data() + (i * 16 + (j + 1) * 4);
+
+      BOOST_CHECK(std::is_sorted(begin, end));
+    }
+  }
+
+  out = deviceSortKV(key, value, {4, 4, 4}, 1);
+  BOOST_CHECK(
+      std::is_permutation(std::begin(value), std::end(value), std::begin(out)));
+
+  // Check that the elements are in sorted order on the specified dimension
+  for (std::size_t i = 0; i < out.size(); ++i) {
+    keyPermuted[i] = key[out[i]];
+  }
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        BOOST_CHECK(keyPermuted[i * 16 + j * 4 + k] <=
+                    keyPermuted[i * 16 + (j + 1) * 4 + k]);
+      }
+    }
+  }
+
+  out = deviceSortKV(key, value, {4, 4, 4}, 0);
+  BOOST_CHECK(
+      std::is_permutation(std::begin(value), std::end(value), std::begin(out)));
+
+  // Check that the elements are in sorted order on the specified dimension
+  for (std::size_t i = 0; i < out.size(); ++i) {
+    keyPermuted[i] = key[out[i]];
+  }
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        BOOST_CHECK(keyPermuted[i * 16 + j * 4 + k] <=
+                    keyPermuted[(i + 1) * 16 + j * 4 + k]);
+      }
+    }
+  }
+
+  out = deviceSortKV(key, value, {16, 4}, 0);
+  BOOST_CHECK(
+      std::is_permutation(std::begin(value), std::end(value), std::begin(out)));
+
+  // Check that the elements are in sorted order on the specified dimension
+  for (std::size_t i = 0; i < out.size(); ++i) {
+    keyPermuted[i] = key[out[i]];
+  }
+  for (int i = 0; i < 15; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      BOOST_CHECK(keyPermuted[i * 4 + j] <= keyPermuted[(i + 1) * 4 + j]);
+    }
+  }
+
+  out = deviceSortKV(key, value, {16, 4}, 1);
+  BOOST_CHECK(
+      std::is_permutation(std::begin(value), std::end(value), std::begin(out)));
+
+  // Check that the elements are in sorted order on the specified dimension
+  for (std::size_t i = 0; i < out.size(); ++i) {
+    keyPermuted[i] = key[out[i]];
+  }
+  for (int i = 0; i < 15; ++i) {
+    const auto begin = keyPermuted.data() + (i * 4);
+    const auto end = keyPermuted.data() + ((i + 1) * 4);
+
+    BOOST_CHECK(std::is_sorted(begin, end));
+  }
+}
diff --git a/tests/popops/StdOperatorsTest.cpp b/tests/popops/StdOperatorsTest.cpp
index 15555799..3e1d91bd 100644
--- a/tests/popops/StdOperatorsTest.cpp
+++ b/tests/popops/StdOperatorsTest.cpp
@@ -186,6 +186,26 @@ static void setBinaryOpInputs(int hIn1[DIM_SIZE][DIM_SIZE],
   }
 }
 
+static void setBinaryOpInputs(unsigned hIn1[DIM_SIZE][DIM_SIZE],
+                              unsigned hIn2[DIM_SIZE][DIM_SIZE],
+                              bool isShift = false) {
+  int val1 = 25;
+  int val2 = 59;
+  for (auto r = 0U; r != DIM_SIZE; ++r) {
+    for (auto c = 0U; c != DIM_SIZE; ++c) {
+      hIn1[r][c] = (1 - 2 * (r & 1)) * (r + val1);
+      hIn2[r][c] = (1 - 2 * ((r + c) & 1)) * (r + c + val2);
+
+      // Shifting by more than 32 is undefined.
+      if (isShift) {
+        hIn2[r][c] = hIn2[r][c] % 32;
+        if (hIn2[r][c] < 0)
+          hIn2[r][c] = -hIn2[r][c];
+      }
+    }
+  }
+}
+
 template <typename T> void convertToPositive(T array[DIM_SIZE][DIM_SIZE]) {
   for (auto i = 0U; i < DIM_SIZE; ++i) {
     for (auto j = 0U; j < DIM_SIZE; ++j) {
@@ -399,7 +419,7 @@ void powTest() {
   }
 }
 
-template <typename InType> void selectTest() {
+template <typename InType> void selectTest(bool inPlace = false) {
   auto type = equivalent_device_type<InType>().value;
   auto device = createTestDevice(deviceType);
   Graph graph(device.getTarget());
@@ -420,7 +440,12 @@ template <typename InType> void selectTest() {
   Tensor in3 = mapUnaryOpTensor(graph, BOOL);
 
   auto prog = Sequence();
-  auto out = select(graph, in1, in2, in3, prog);
+  Tensor out = in1;
+  if (!inPlace) {
+    out = select(graph, in1, in2, in3, prog);
+  } else {
+    selectInPlace(graph, in1, in2, in3, prog);
+  }
   graph.createHostWrite("in1", in1);
   graph.createHostWrite("in2", in2);
   graph.createHostWrite("in3", in3);
@@ -2087,13 +2112,23 @@ int main(int argc, char **argv) {
   } else if (test == "SelectHalfLHSAndRHSConst") {
     selectTestHalfLHSAndRHSConst();
   } else if (test == "SelectInt") {
-    selectTest<int>();
+    selectTest<int>(false);
+  } else if (test == "SelectUInt") {
+    selectTest<unsigned>(false);
+  } else if (test == "SelectInPlaceInt") {
+    selectTest<int>(true);
+  } else if (test == "SelectInPlaceUInt") {
+    selectTest<unsigned>(true);
   } else if (test == "BroadcastSelectorSelectInt") {
     broadcastSelectorSelectTest<int>(false);
+  } else if (test == "BroadcastSelectorSelectUInt") {
+    broadcastSelectorSelectTest<unsigned>(false);
   } else if (test == "BroadcastSelectorSelectFloat") {
     broadcastSelectorSelectTest<float>(false);
   } else if (test == "BroadcastSelectorSelectInPlaceInt") {
     broadcastSelectorSelectTest<int>(true);
+  } else if (test == "BroadcastSelectorSelectInPlaceUInt") {
+    broadcastSelectorSelectTest<unsigned>(true);
   } else if (test == "BroadcastSelectorSelectInPlaceFloat") {
     broadcastSelectorSelectTest<float>(true);
   } else if (test == "ClampFloat") {
diff --git a/tests/popops/codelets/BroadcastSelect.cpp.in b/tests/popops/codelets/BroadcastSelect.cpp.in
index 1a8784ad..9881e463 100644
--- a/tests/popops/codelets/BroadcastSelect.cpp.in
+++ b/tests/popops/codelets/BroadcastSelect.cpp.in
@@ -1,3 +1,5 @@
+// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
+
 // Tests for 'BroadcastSelect' and 'BroadcastSelectorSelect[InPlace]' codelets.
 // This file will create 3 boost test cases (one for each codelet).
 //
@@ -38,6 +40,8 @@
 typedef float TEST_TYPE;
 #elif defined(DATA_TYPE_bool)
 typedef unsigned char TEST_TYPE;
+#elif defined(DATA_TYPE_unsigned_int)
+typedef unsigned TEST_TYPE;
 #else
 typedef @DATA_TYPE@ TEST_TYPE;
 #endif
@@ -211,7 +215,7 @@ BOOST_AUTO_TEST_CASE(BROADCAST_SELECT_TYPE) {
   BroadcastSelectTest(false, false, true);
   BroadcastSelectTest(true, true, true);
   BroadcastSelectTest(false, false, true);
-#elif defined(DATA_TYPE_int)
+#elif defined(DATA_TYPE_int) || defined(DATA_TYPE_unsigned_int)
   BroadcastSelectTest(0xcafebabe, 0xdeadbeef, 0x12345678);
 #else
   BroadcastSelectTest(31.4159, 27.1828, 666.0);
diff --git a/tests/popops/codelets/CMakeLists.txt b/tests/popops/codelets/CMakeLists.txt
index 2045651a..9c056b69 100644
--- a/tests/popops/codelets/CMakeLists.txt
+++ b/tests/popops/codelets/CMakeLists.txt
@@ -384,7 +384,7 @@ foreach(TYPE half float)
 endforeach()
 
 # Select
-foreach(DATA_TYPE int float half bool)
+foreach(DATA_TYPE int unsigned_int float half bool)
   set(VARIANT_NAME "Select_${DATA_TYPE}")
   set(VARIANT_SRC "${CMAKE_CURRENT_BINARY_DIR}/${VARIANT_NAME}.cpp")
   string(TOUPPER "${DATA_TYPE}" TYPE)
@@ -395,7 +395,7 @@ foreach(DATA_TYPE int float half bool)
 endforeach()
 
 # BroadcastSelect
-foreach(DATA_TYPE int float half bool)
+foreach(DATA_TYPE int unsigned_int float half bool)
   set(VARIANT_NAME "BroadcastSelect_${DATA_TYPE}")
   set(VARIANT_SRC "${CMAKE_CURRENT_BINARY_DIR}/${VARIANT_NAME}.cpp")
   string(TOUPPER "${DATA_TYPE}" TYPE)
diff --git a/tests/popops/codelets/select/unsigned_int.hpp b/tests/popops/codelets/select/unsigned_int.hpp
new file mode 100644
index 00000000..3438dcd2
--- /dev/null
+++ b/tests/popops/codelets/select/unsigned_int.hpp
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 Graphcore Ltd. All rights reserved.
+const std::vector<std::vector<int>> in1 = {{},
+                                           {11},
+                                           {31, 33},
+                                           {51, 53, 55},
+                                           {71, 73, 75, 77},
+                                           {91, 93, 95, 97, 99},
+                                           {1, 3, 5, 7, 9, 21, 23, 25, 27}};
+
+const std::vector<std::vector<int>> in2 = {{},
+                                           {20},
+                                           {40, 42},
+                                           {60, 62, 64},
+                                           {80, 82, 84, 86},
+                                           {100, 102, 104, 106, 108},
+                                           {2, 4, 6, 8, 10, 22, 24, 26, 28}};
+
+const std::vector<std::vector<int>> expected = {
+    {},
+    {11},
+    {40, 33},
+    {60, 53, 64},
+    {71, 82, 75, 86},
+    {91, 93, 104, 106, 99},
+    {2, 4, 6, 7, 9, 21, 24, 25, 28}};
diff --git a/tools/topk.cpp b/tools/topk.cpp
index e6744fee..b598b12d 100644
--- a/tools/topk.cpp
+++ b/tools/topk.cpp
@@ -2,14 +2,20 @@
 
 #include <iostream>
 
+#include <poplar/CycleCount.hpp>
 #include <poplar/Engine.hpp>
 #include <poplar/Program.hpp>
 #include <poplar/exceptions.hpp>
 #include <poplibs_support/TestDevice.hpp>
+#include <poplibs_support/print.hpp>
 #include <poplibs_test/Util.hpp>
 #include <popnn/Loss.hpp>
 #include <popnn/codelets.hpp>
+#include <popops/Sort.hpp>
+#include <popops/TopK.hpp>
+#include <popops/codelets.hpp>
 #include <poputil/TileMapping.hpp>
+#include <poputil/exceptions.hpp>
 
 #include <boost/multi_array.hpp>
 #include <boost/program_options.hpp>
@@ -18,12 +24,65 @@ using namespace poplar;
 using namespace poplar::program;
 using namespace poplibs_support;
 using namespace poplibs_test::util;
+using namespace poputil;
 
 #define FLOAT_REL_TOL 1e-6
 #define HALF_REL_TOL 1e-5
 #define FLOAT_ABS_TOL 1e-6
 #define HALF_ABS_TOL 1e-5
 
+enum class API {
+  Popops,
+  Popnn,
+  PopopsSort,
+};
+
+inline std::ostream &operator<<(std::ostream &os, const API &api) {
+  switch (api) {
+  case API::Popops:
+    os << "popops";
+    break;
+  case API::Popnn:
+    os << "popnn";
+    break;
+  case API::PopopsSort:
+    os << "popops-sort";
+  default:
+    throw poplibs_error("Unhandled API type");
+  }
+  return os;
+}
+
+inline std::istream &operator>>(std::istream &is, API &api) {
+  std::string token;
+  is >> token;
+  if (token == "popops") {
+    api = API::Popops;
+  } else if (token == "popnn") {
+    api = API::Popnn;
+  } else if (token == "popops-sort") {
+    api = API::PopopsSort;
+  } else {
+    throw poplibs_error("Unknown API type '" + token + "'");
+  }
+  return is;
+}
+
+inline std::istream &operator>>(std::istream &is, popops::SortOrder &o) {
+  std::string token;
+  is >> token;
+  if (token == "none") {
+    o = popops::SortOrder::NONE;
+  } else if (token == "ascending") {
+    o = popops::SortOrder::ASCENDING;
+  } else if (token == "descending") {
+    o = popops::SortOrder::DESCENDING;
+  } else {
+    throw poplibs_error("Unknown sort order '" + token + "'");
+  }
+  return is;
+}
+
 int main(int argc, char **argv) try {
 
   namespace po = boost::program_options;
@@ -33,7 +92,11 @@ int main(int argc, char **argv) try {
   boost::optional<unsigned> tilesPerIPU;
   Type dataType = FLOAT;
   Type indexType = UNSIGNED_INT;
-  bool sortOutput = true;
+  bool largest = true;
+  popops::SortOrder sortOrder = popops::SortOrder::ASCENDING;
+  API api = API::Popops;
+  bool returnIndices = true;
+  bool returnValues = true;
 
   po::options_description desc("Options");
   // clang-format off
@@ -52,6 +115,8 @@ int main(int argc, char **argv) try {
       ("ignore-data",
        "Don't upload/download to/from the device and consequently don't "
        "validate results")
+      ("report-total-cycles",
+       "Print total cycle count for the whole operation")
       ("n",
        po::value(&n)->required(),
        "Number of input elements")
@@ -68,9 +133,23 @@ int main(int argc, char **argv) try {
       ("index-type",
        po::value(&indexType)->default_value(indexType),
        "The type of the indices")
-      ("sort-output",
-       po::value(&sortOutput)->default_value(sortOutput),
-       "Ensure the output of the top-k is sorted")
+      ("largest",
+       po::value(&largest)->default_value(largest),
+       "If true return the top k largest elements, otherwise return top k smallest elements")
+      ("sort-order",
+       po::value(&sortOrder)->default_value(sortOrder),
+       "Sort order of the output of the top-k")
+      ("return-indices",
+       po::value(&returnIndices)->default_value(returnIndices),
+       "Use API returning indices of top k values")
+      ("return-values",
+       po::value(&returnValues)->default_value(returnValues),
+       "Use API returning top k values")
+      ("api",
+       po::value(&api)->default_value(api),
+       "Which API to use (popops | popnn)")
+      ("random-seed",
+       "Use a random seed")
   ;
   // clang-format on
 
@@ -90,24 +169,82 @@ int main(int argc, char **argv) try {
   }
 
   bool profile = vm.count("profile");
-  bool ignoreData = vm.count("ignore-data");
   bool showVarStorage = vm.count("show-var-storage");
+  bool reportTotalCycles = vm.count("report-total-cycles");
+  bool randomSeed = vm.count("random-seed");
+
+  if (profile && reportTotalCycles) {
+    std::cerr
+        << "Can't report total cycles and profile at the same time as "
+           "profiling instrumentation would skew total cycles measurement\n";
+    return 1;
+  }
 
   // If k was not explicitly provided, set it equal to n
   if (!vm.count("k")) {
     k = n;
   }
 
-  std::cout << "Top-K with batch-size " << batchSize << ", input size " << n
-            << ", and output size " << k << "\n";
+  if (!returnIndices && !returnValues) {
+    std::cerr
+        << "At least one of return-indices and return-values must be true\n";
+    return 1;
+  }
+
+  switch (api) {
+  case API::Popops:
+    // Nothing. Popops API supports all arguments.
+    break;
+  case API::Popnn:
+    if (!returnIndices) {
+      std::cerr << "Warning: popnn API only supports returning both indices "
+                   "and values. Forcing return-indices on\n";
+      returnIndices = true;
+    }
+    if (!returnValues) {
+      std::cerr << "Warning: popnn API only supports returning both indices "
+                   "and values. Forcing return-values on\n";
+      returnValues = true;
+    }
+    if (sortOrder == popops::SortOrder::ASCENDING) {
+      std::cerr << "Warning: popnn API only supports returning values sorted "
+                   "in descending order. Forcing sort-order to Descending\n";
+      sortOrder = popops::SortOrder::DESCENDING;
+    }
+    if (!largest) {
+      std::cerr << "Warning: popnn API only supports returning top k largest "
+                   "values. Forcing largest to true\n";
+      largest = true;
+    }
+    break;
+  case API::PopopsSort:
+    if (n != k) {
+      std::cerr << "Warning: popops sort API only supports full sort. Forcing "
+                   "k equal to n\n";
+      k = n;
+    }
+    if (returnIndices && returnValues) {
+      std::cerr << "Warning: popops sort API only supports returning either "
+                   "keys or values not both. Forcing returnValues to false\n";
+      returnValues = false;
+    }
+    break;
+  }
 
   constexpr bool alwaysCompileCode = true;
   auto device = createTestDevice(deviceType, 1, tilesPerIPU, alwaysCompileCode);
   const auto &target = device.getTarget();
   Graph graph(target);
-  popnn::addCodelets(graph);
+  if (api == API::Popnn) {
+    popnn::addCodelets(graph);
+  } else if (api == API::Popops || api == API::PopopsSort) {
+    popops::addCodelets(graph);
+  }
 
-  const auto in = graph.addVariable(dataType, {batchSize, n}, "in");
+  const std::vector<std::size_t> inShape = {batchSize, n};
+  const std::vector<std::size_t> outShape = {batchSize, k};
+
+  const auto in = graph.addVariable(dataType, inShape, "in");
   // TODO: Eventually we should have an allocation function for the inputs
   // which will probably just map linearly with some kind of grain size.
   poputil::mapTensorLinearly(graph, in);
@@ -116,15 +253,69 @@ int main(int argc, char **argv) try {
   Sequence prog, uploadProg, downloadProg;
 
   Tensor outIndices;
-  Tensor outValues =
-      popnn::topK(graph, in, outIndices, k, sortOutput, prog, "top-k");
+  Tensor outValues;
+  if (api == API::Popnn) {
+    const bool sorted = sortOrder != popops::SortOrder::NONE;
+    outValues = popnn::topK(graph, in, outIndices, k, sorted, prog, "top-k");
+    // Weirdly this interface seems to return the partials dimension as part of
+    // the result, even though this is always 1?
+    outValues = outValues.squeeze({1});
+    outIndices = outIndices.squeeze({1});
+  } else if (api == API::Popops) {
+    const popops::TopKParams params(k, largest, sortOrder);
+    if (returnIndices) {
+      std::tie(outValues, outIndices) =
+          popops::topKWithPermutation(graph, prog, in, params, "top-k");
+    } else {
+      outValues = popops::topK(graph, prog, in, params, "top-k");
+    }
+  } else if (api == API::PopopsSort) {
+    if (returnValues) {
+      outValues = popops::sort(graph, in, 1, prog, "top-k");
+    } else {
+      std::vector<unsigned> batchIndices(n);
+      std::iota(batchIndices.begin(), batchIndices.end(), 0);
+      const auto iota = graph.addConstant(
+          indexType, {1, n}, ArrayRef(batchIndices), "indicesInitializer");
+      poputil::mapTensorLinearly(graph, iota);
+      auto indices = iota.broadcast(batchSize, 0);
+      outIndices = popops::sortKeyValue(graph, in, indices, 1, prog, "top-k");
+    }
+  }
+
+  // Check types/shapes returned by the API
+  if (returnIndices) {
+    if (outIndices.elementType() != indexType) {
+      std::cerr << "Actual index type (" << outIndices.elementType()
+                << ") is not the requested index type ( " << indexType << ")\n";
+      return 1;
+    }
+    if (outIndices.shape() != outShape) {
+      std::cerr << "Shape of returned indices (" << outIndices.shape()
+                << ") does not match the expected shape (" << outShape << ")\n";
+      return 1;
+    }
+  }
+  if (returnValues) {
+    if (outValues.elementType() != dataType) {
+      std::cerr << "Actual value type (" << outIndices.elementType()
+                << ") is not the requested value type ( " << dataType << ")\n";
+    }
+    if (outValues.shape() != outShape) {
+      std::cerr << "Shape of returned values (" << outValues.shape()
+                << ") does not match the expected shape (" << outShape << ")\n";
+      return 1;
+    }
+  }
 
   std::unique_ptr<char[]> rawHostIn, rawHostIndicesOut, rawHostValuesOut;
-  if (!ignoreData) {
-    rawHostIn = allocateHostMemoryForTensor(in, "in", graph, uploadProg,
-                                            downloadProg, tmap);
+  rawHostIn = allocateHostMemoryForTensor(in, "in", graph, uploadProg,
+                                          downloadProg, tmap);
+  if (returnIndices) {
     rawHostIndicesOut = allocateHostMemoryForTensor(
         outIndices, "outIndices", graph, uploadProg, downloadProg, tmap);
+  }
+  if (returnValues) {
     rawHostValuesOut = allocateHostMemoryForTensor(
         outValues, "outValues", graph, uploadProg, downloadProg, tmap);
   }
@@ -134,6 +325,12 @@ int main(int argc, char **argv) try {
     engineOptions.set("debug.instrument", "true");
   }
 
+  Tensor cycleCounter;
+  if (reportTotalCycles) {
+    cycleCounter = cycleCount(graph, prog, 0, "measure-total-cycles");
+    graph.createHostRead("totalCycleCount", cycleCounter);
+  }
+
   Engine engine(graph, Sequence(uploadProg, prog, downloadProg), engineOptions);
   attachStreams(engine, tmap);
 
@@ -142,58 +339,99 @@ int main(int argc, char **argv) try {
   std::vector<double> hostValuesOut(batchSize * k);
 
   std::mt19937 randomEngine;
-  if (!ignoreData) {
-    writeRandomValues(target, dataType, hostIn, -50.0, 50.0, randomEngine);
-    copy(target, hostIn, dataType, rawHostIn.get());
+  if (randomSeed) {
+    const auto seed = std::random_device{}();
+    std::cout << "Seeding random engine with seed " << seed << "\n";
+    randomEngine.seed(seed);
   }
+  // TODO: Check what happens with NaN values...
+  writeRandomValues(target, dataType, hostIn, -50.0, 50.0, randomEngine);
+  copy(target, hostIn, dataType, rawHostIn.get());
 
-  device.bind([&](const Device &d) { engine.loadAndRun(d); });
+  device.bind([&](const Device &d) {
+    engine.loadAndRun(d);
+    if (reportTotalCycles) {
+      std::uint64_t cycleCount;
+      engine.readTensor("totalCycleCount", &cycleCount);
+      std::cout << "Total cycles for top-k program were " << cycleCount << "\n";
+    }
+  });
 
   bool matchesModel = true;
-  if (!ignoreData) {
+  if (returnIndices) {
     copy(target, indexType, rawHostIndicesOut.get(), hostIndicesOut);
+  }
+  if (returnValues) {
     copy(target, dataType, rawHostValuesOut.get(), hostValuesOut);
-    // Verify against top-k on the host.
-    std::vector<unsigned> modelIndicesOut(batchSize * k);
-    std::vector<double> modelValuesOut(batchSize * k);
-    {
-      std::vector<unsigned> indices(n);
-      for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) {
-        std::iota(indices.begin(), indices.end(), 0);
-        std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
-                          [&](unsigned a, unsigned b) {
-                            return hostIn[batchIdx * n + a] >
-                                   hostIn[batchIdx * n + b];
-                          });
-        for (unsigned i = 0; i < k; ++i) {
-          modelIndicesOut[batchIdx * k + i] = indices[i];
-          modelValuesOut[batchIdx * k + i] = hostIn[batchIdx * n + indices[i]];
-        }
+  }
+
+  // Verify against top-k on the host.
+  const auto partialSortComparator =
+      [&]() -> std::function<bool(double, double)> {
+    if (largest) {
+      return std::greater<double>{};
+    } else {
+      return std::less<double>{};
+    }
+  }();
+
+  std::vector<unsigned> modelIndicesOut(batchSize * k);
+  std::vector<double> modelValuesOut(batchSize * k);
+  {
+    std::vector<unsigned> indices(n);
+    for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) {
+      std::iota(indices.begin(), indices.end(), 0);
+      std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
+                        [&](unsigned a, unsigned b) {
+                          return partialSortComparator(
+                              hostIn[batchIdx * n + a],
+                              hostIn[batchIdx * n + b]);
+                        });
+      // Result of the partial sort will be ordered differently depending on
+      // whether we wanted the largest or smallest top-k elements, so ensure the
+      // requested correct ordering.
+      if ((largest && sortOrder == popops::SortOrder::ASCENDING) ||
+          (!largest && sortOrder == popops::SortOrder::DESCENDING)) {
+        std::reverse(indices.begin(), indices.begin() + k);
+      }
+      for (unsigned i = 0; i < k; ++i) {
+        modelIndicesOut[batchIdx * k + i] = indices[i];
+        modelValuesOut[batchIdx * k + i] = hostIn[batchIdx * n + indices[i]];
       }
     }
+  }
 
-    // If the output isn't already supposed to be sorted, sort it on the host
-    // so that we can compare element for element to check the result.
-    if (!sortOutput) {
-      std::vector<unsigned> sortedIndices(k);
-      std::vector<double> valBuffer(k);
-      std::vector<unsigned> indexBuffer(k);
-      for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) {
-        std::iota(sortedIndices.begin(), sortedIndices.end(), 0);
-        std::sort(sortedIndices.begin(), sortedIndices.end(),
-                  [&](unsigned a, unsigned b) {
-                    return hostValuesOut[batchIdx * k + a] >
-                           hostValuesOut[batchIdx * k + b];
-                  });
+  // If the output isn't already supposed to be sorted, sort it on the host
+  // so that we can compare element for element to check the result.
+  if (sortOrder == popops::SortOrder::NONE) {
+    std::vector<unsigned> sortedIndices(k);
+    std::vector<double> valBuffer(k);
+    std::vector<unsigned> indexBuffer(k);
+    for (unsigned batchIdx = 0; batchIdx < batchSize; ++batchIdx) {
+      std::iota(sortedIndices.begin(), sortedIndices.end(), 0);
+      std::sort(sortedIndices.begin(), sortedIndices.end(),
+                [&](unsigned a, unsigned b) {
+                  return partialSortComparator(hostValuesOut[batchIdx * k + a],
+                                               hostValuesOut[batchIdx * k + b]);
+                });
+      if (returnIndices) {
         std::copy_n(&hostIndicesOut[batchIdx * k], k, indexBuffer.begin());
-        std::copy_n(&hostValuesOut[batchIdx * k], k, valBuffer.begin());
         for (unsigned i = 0; i < k; ++i) {
           hostIndicesOut[batchIdx * k + i] = indexBuffer[sortedIndices[i]];
+        }
+      }
+      if (returnValues) {
+        std::copy_n(&hostValuesOut[batchIdx * k], k, valBuffer.begin());
+        for (unsigned i = 0; i < k; ++i) {
           hostValuesOut[batchIdx * k + i] = valBuffer[sortedIndices[i]];
         }
       }
     }
+  }
 
+  double relTolerance = dataType == FLOAT ? FLOAT_REL_TOL : HALF_REL_TOL;
+  double absTolerance = dataType == FLOAT ? FLOAT_ABS_TOL : HALF_ABS_TOL;
+  if (returnIndices) {
     // Because 2 values might be equal and therefore the order of the indices
     // is not well defined, we don't directly check the indices but instead
     // check the values the indices point to match the data.
@@ -211,11 +449,11 @@ int main(int argc, char **argv) try {
         }
       }
     }
-    double relTolerance = dataType == FLOAT ? FLOAT_REL_TOL : HALF_REL_TOL;
-    double absTolerance = dataType == FLOAT ? FLOAT_ABS_TOL : HALF_ABS_TOL;
     matchesModel &= checkIsClose("indexedValues", indexedValues.data(),
                                  {batchSize, k}, modelValuesOut.data(),
                                  batchSize * k, relTolerance, absTolerance);
+  }
+  if (returnValues) {
     matchesModel &= checkIsClose("values", hostValuesOut.data(), {batchSize, k},
                                  modelValuesOut.data(), batchSize * k,
                                  relTolerance, absTolerance);