damageboy · damageboy · May 11, 2023 · May 11, 2023 · May 14, 2023 · May 14, 2023
diff --git a/.clang-format b/.clang-format
@@ -3,7 +3,7 @@ BasedOnStyle: Chromium
 
 ---
 Language: Cpp
-ColumnLimit: 160
+ColumnLimit: 100
 IndentWidth: 4
 
 ...
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
@@ -50,6 +50,15 @@ jobs:
       with:
         arch: x64
 
+    # Work around https://github.com/actions/runner-images/issues/8659
+    - name: "Remove GCC 13 from runner image (workaround)"
+      shell: bash
+      if: startsWith(runner.os, 'Linux')
+      run: |
+        sudo rm -f /etc/apt/sources.list.d/ubuntu-toolchain-r-ubuntu-test-jammy.list
+        sudo apt-get update
+        sudo apt-get install -y --allow-downgrades libc6=2.35-0ubuntu3.4 libc6-dev=2.35-0ubuntu3.4 libstdc++6=12.3.0-1ubuntu1~22.04 libgcc-s1=12.3.0-1ubuntu1~22.04
+
     - name: Setup Ninja
       uses: ashutoshvarma/setup-ninja@master
       with:
@@ -187,6 +196,8 @@ jobs:
     - name: Test Report
       uses: dorny/test-reporter@v1
       if:  steps.check_cpu.outputs.has_avx2 == 1 || steps.check_cpu.outputs.has_avx512 == 1
+      env:
+        NODE_OPTIONS: --max-old-space-size=4096 --stack-size=2048
       with:
         name: tests/${{ matrix.config.name}}
         path: build/tests/junit/*.xml

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -19,7 +19,7 @@ set(VXSORT_USE_LINKER "" CACHE STRING "Custom linker for -fuse-ld=...")
 find_program(CCACHE_PROGRAM ccache)
 if(CCACHE_PROGRAM AND ${VXSORT_CCACHE})
     message("ccache detected - using ccache to cache object files across compilations")
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")    
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
 endif()
 
 # Make sure we can import out CMake functions
@@ -203,8 +203,8 @@ CPMAddPackage(
 CPMAddPackage(
         NAME googletest
         GITHUB_REPOSITORY google/googletest
-        GIT_TAG v1.13.0
-        VERSION 1.13.0
+        GIT_TAG v1.14.0
+        VERSION 1.14.0
         OPTIONS "BUILD_GMOCK OFF" "INSTALL_GTEST OFF" "gtest_force_shared_crt"
         OVERRIDE_FIND_PACKAGE
 )
@@ -214,7 +214,8 @@ CPMAddPackage(
         GIT_TAG main
         OPTIONS "BUILD_TESTING OFF"
 )
-CPMAddPackage("gh:fmtlib/fmt#9.1.0")
+CPMAddPackage("gh:fmtlib/fmt#10.1.1")
+CPMAddPackage("gh:Neargye/magic_enum#v0.9.2")
 CPMAddPackage("gh:okdshin/PicoSHA2#master")
 
 enable_testing()

diff --git a/README.md b/README.md
@@ -1,32 +1,53 @@
 # vxsort-cpp
 
+## Tests
+
 [![Build and Test](https://github.com/damageboy/vxsort-cpp/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/damageboy/vxsort-cpp/actions/workflows/build-and-test.yml)
 ![Latest Test Status](https://gist.githubusercontent.com/damageboy/dfd9d01f2c710f96b444532b92539321/raw/vxsort-suites-badge.svg)
 ![Latest Test Status](https://gist.githubusercontent.com/damageboy/dfd9d01f2c710f96b444532b92539321/raw/vxsort-tests-badge.svg)
 ![Latest Test Status](https://gist.githubusercontent.com/damageboy/dfd9d01f2c710f96b444532b92539321/raw/vxsort-runs-badge.svg)
 
 ## What
 
-This is a port of the C# [VxSort](https://github.com/damageboy/VxSort/) to high-perf C++.
+vxsort is a fast, somewhat novel, hybrid, vectorized quicksort+bitonic primitive sorter implemented in C++.  
+The name vxsort stands for <ins>v</ins>ectorized 10<ins>x</ins> <ins>sort</ins>.
+It currently supports the following combination of vector ISA and primitive types:
+
+|        | i64 | i32 | i16           | u64 | u32 | u16           | f64 | f32 | f16 |
+|--------|-----|-----|---------------|-----|-----|---------------|-----|-----|-----|
+| AVX2   | ✅   | ✅   | ✅             | ✅   | ✅   | ✅             | ✅   | ✅   | ❌   |
+| AVX512 | ✅   | ✅   | ✅<sup>1</sup> | ✅   | ✅   | ✅<sup>1</sup> | ✅   | ✅   | ❌   |
+| ARM-Neon|  ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   |
+| ARM-SVE2|  ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   |
+| RiscV-V 1.0 |  ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   | ❌   |
+
+<sup>1</sup> - Requires AVX512/VBMI2 support which is available for all Intel AVX512 CPUs post Icelake, AMD post Zen4 
+
+
+## Benchmark Results
 
 ## Building
 
-```bash
-mkdir build-release
-cd build-release
+```shell
+mkdir build
+cd build
 # For better code-gen
-export CC=clang
-export CXX=clang++
-cmake ..
-make -j 4
+export CC=clang CXX=clang++
+cmake .. -G Ninja
+ninja
 ```
 
 ## Testing
 
+To run tests, use ctest, preferrably with `-J $(nproc)` to avoid waiting for a long time: 
+
 ```bash
-./test/vxsort_test
+ctest -J $(nproc)
 ```
 
+Tests are built into 3 executables (signed integers, unsigned integer, floating-point) per supported vector ISA.
+This allows for easy to expolit parallelization both whne building and executing the tests.
+
 ## Benchmarking
 
 1. Plug in to a power-source if on laptop
@@ -42,149 +63,4 @@ make -j 4
 ./bench/run.sh
 ```
 
-## Results (Ryzen 3950X, 3.8Ghz)
-
-### int64
-
-Compared to Introspective Sort, we can hit:
-* For 1M `int64` elements, roughly 4.8x improvement with 8x unroll (`55ns` per element -> `11.5ns`)
-* For 128K `int64` elements, roughly 4.5x improvement with 8x unroll (`45ns` per element -> `10.5ns`)
-
-#### Introspective Sort (Scalar Baseline):
-
-```bash
------------------------------------------------------------------------------------------
-Benchmark                                    Time             CPU   Iterations     Time/N
------------------------------------------------------------------------------------------
-BM_full_introsort/4096                    1.18 ms         1.18 ms          586  28.8719ns
-BM_full_introsort/8192                    2.78 ms         2.77 ms          262  33.8639ns
-BM_full_introsort/16384                   5.86 ms         5.86 ms          120  35.7374ns
-BM_full_introsort/32768                   12.4 ms         12.4 ms           54   37.948ns
-BM_full_introsort/65536                   27.7 ms         27.6 ms           25  42.1669ns
-BM_full_introsort/131072                  59.3 ms         59.3 ms           12  45.2203ns
-BM_full_introsort/262144                   124 ms          124 ms            6   47.261ns
-BM_full_introsort/524288                   268 ms          268 ms            3  51.0427ns
-BM_full_introsort/1048576                  557 ms          557 ms            1  53.0751ns
-```
-
-#### VxSort No Unroll, Bitonic Sort 64-elements
-
-```bash
------------------------------------------------------------------------------------------
-Benchmark                                    Time             CPU   Iterations     Time/N
------------------------------------------------------------------------------------------
-BM_vxsort<int64_t, 1>/4096          0.486 ms        0.485 ms         1457  11.8505ns
-BM_vxsort<int64_t, 1>/8192           1.29 ms         1.29 ms          561  15.7416ns
-BM_vxsort<int64_t, 1>/16384          2.76 ms         2.75 ms          253  16.8082ns
-BM_vxsort<int64_t, 1>/32768          6.03 ms         6.03 ms          116  18.3878ns
-BM_vxsort<int64_t, 1>/65536          13.1 ms         13.1 ms           53   19.927ns
-BM_vxsort<int64_t, 1>/131072         27.7 ms         27.7 ms           25  21.1131ns
-BM_vxsort<int64_t, 1>/262144         60.1 ms         60.1 ms           12  22.9112ns
-BM_vxsort<int64_t, 1>/524288          127 ms          126 ms            5  24.1048ns
-BM_vxsort<int64_t, 1>/1048576         269 ms          269 ms            3  25.6178ns
-```
-
-#### VxSort Unroll x 4, Bitonic Sort 64-elements
-
-```bash
------------------------------------------------------------------------------------------
-Benchmark                                    Time             CPU   Iterations     Time/N
------------------------------------------------------------------------------------------
-BM_vxsort<int64_t, 4>/4096          0.279 ms        0.279 ms         2462  6.79957ns
-BM_vxsort<int64_t, 4>/8192          0.673 ms        0.672 ms         1000  8.20411ns
-BM_vxsort<int64_t, 4>/16384          1.52 ms         1.52 ms          455  9.25887ns
-BM_vxsort<int64_t, 4>/32768          3.37 ms         3.36 ms          210  10.2602ns
-BM_vxsort<int64_t, 4>/65536          7.20 ms         7.20 ms           96   10.982ns
-BM_vxsort<int64_t, 4>/131072         15.1 ms         15.1 ms           46  11.4838ns
-BM_vxsort<int64_t, 4>/262144         32.5 ms         32.5 ms           21  12.3887ns
-BM_vxsort<int64_t, 4>/524288         67.4 ms         67.3 ms           10  12.8354ns
-BM_vxsort<int64_t, 4>/1048576         144 ms          144 ms            5   13.689ns
-```
 
-#### VxSort Unroll x 8, Bitonic Sort 64-elements
-
-```bash
------------------------------------------------------------------------------------------
-Benchmark                                    Time             CPU   Iterations     Time/N
------------------------------------------------------------------------------------------
-BM_vxsort<int64_t, 8>/4096          0.271 ms        0.271 ms         2601  6.61364ns
-BM_vxsort<int64_t, 8>/8192          0.603 ms        0.603 ms         1190  7.35612ns
-BM_vxsort<int64_t, 8>/16384          1.35 ms         1.35 ms          517  8.23185ns
-BM_vxsort<int64_t, 8>/32768          2.96 ms         2.96 ms          232  9.02835ns
-BM_vxsort<int64_t, 8>/65536          6.32 ms         6.31 ms          111    9.634ns
-BM_vxsort<int64_t, 8>/131072         13.3 ms         13.3 ms           52  10.1321ns
-BM_vxsort<int64_t, 8>/262144         27.8 ms         27.8 ms           25    10.61ns
-BM_vxsort<int64_t, 8>/524288         59.7 ms         59.6 ms           12  11.3669ns
-BM_vxsort<int64_t, 8>/1048576         121 ms          121 ms            6  11.5373ns
-```
-
-#### VxSort Unroll x 12, Bitonic Sort 64-elements
-
-```bash
------------------------------------------------------------------------------------------
-Benchmark                                    Time             CPU   Iterations     Time/N
------------------------------------------------------------------------------------------
-BM_vxsort<int64_t, 12>/4096         0.275 ms        0.275 ms         2504  6.70556ns
-BM_vxsort<int64_t, 12>/8192         0.593 ms        0.593 ms         1140  7.23399ns
-BM_vxsort<int64_t, 12>/16384         1.38 ms         1.37 ms          496  8.38849ns
-BM_vxsort<int64_t, 12>/32768         2.95 ms         2.95 ms          235   8.9886ns
-BM_vxsort<int64_t, 12>/65536         6.39 ms         6.38 ms          111  9.73922ns
-BM_vxsort<int64_t, 12>/131072        13.2 ms         13.2 ms           53  10.0404ns
-BM_vxsort<int64_t, 12>/262144        28.4 ms         28.4 ms           25   10.833ns
-BM_vxsort<int64_t, 12>/524288        58.9 ms         58.8 ms           12  11.2206ns
-BM_vxsort<int64_t, 12>/1048576        125 ms          124 ms            6  11.8665ns
-```
-
-### int32 
-
-#### VxSort No Unroll, Bitonic Sort 128-elements
-
-```
-```bash
-----------------------------------------------------------------------------------------
-Benchmark                                   Time             CPU   Iterations     Time/N
-----------------------------------------------------------------------------------------
-BM_vxsort<int32_t, 1>/4096         0.169 ms        0.169 ms         4261  4.11785ns
-BM_vxsort<int32_t, 1>/8192         0.459 ms        0.459 ms         1516  5.60347ns
-BM_vxsort<int32_t, 1>/16384         1.19 ms         1.19 ms          596  7.27952ns
-BM_vxsort<int32_t, 1>/32768         2.61 ms         2.61 ms          269  7.96259ns
-BM_vxsort<int32_t, 1>/65536         5.70 ms         5.69 ms          120  8.68616ns
-BM_vxsort<int32_t, 1>/131072        12.6 ms         12.6 ms           57  9.62647ns
-BM_vxsort<int32_t, 1>/262144        26.4 ms         26.4 ms           26  10.0556ns
-BM_vxsort<int32_t, 1>/524288        56.9 ms         56.8 ms           12  10.8417ns
-BM_vxsort<int32_t, 1>/1048576        120 ms          120 ms            6   11.407ns
-```
-
-#### VxSort Unroll x 4, Bitonic Sort 128-elements
-
-```bash
-----------------------------------------------------------------------------------------
-Benchmark                                   Time             CPU   Iterations     Time/N
-----------------------------------------------------------------------------------------
-BM_vxsort<int32_t, 4>/4096         0.135 ms        0.135 ms         4836  3.29119ns
-BM_vxsort<int32_t, 4>/8192         0.291 ms        0.291 ms         2372  3.55463ns
-BM_vxsort<int32_t, 4>/16384        0.657 ms        0.656 ms         1061  4.00521ns
-BM_vxsort<int32_t, 4>/32768         1.57 ms         1.57 ms          462  4.79389ns
-BM_vxsort<int32_t, 4>/65536         3.35 ms         3.34 ms          205  5.10211ns
-BM_vxsort<int32_t, 4>/131072        7.20 ms         7.19 ms           98  5.48511ns
-BM_vxsort<int32_t, 4>/262144        15.4 ms         15.4 ms           45  5.87159ns
-BM_vxsort<int32_t, 4>/524288        34.0 ms         34.0 ms           22  6.48067ns
-BM_vxsort<int32_t, 4>/1048576       68.6 ms         68.5 ms           10  6.53424ns
-```
-
-#### VxSort Unroll x 8, Bitonic Sort 128-elements
-
-```bash
-----------------------------------------------------------------------------------------
-Benchmark                                   Time             CPU   Iterations     Time/N
-----------------------------------------------------------------------------------------
-BM_vxsort<int32_t, 8>/4096         0.132 ms        0.132 ms         5341  3.21375ns
-BM_vxsort<int32_t, 8>/8192         0.292 ms        0.292 ms         2495  3.56416ns
-BM_vxsort<int32_t, 8>/16384        0.631 ms        0.631 ms         1145  3.84954ns
-BM_vxsort<int32_t, 8>/32768         1.43 ms         1.43 ms          524  4.35009ns
-BM_vxsort<int32_t, 8>/65536         3.21 ms         3.21 ms          232  4.89271ns
-BM_vxsort<int32_t, 8>/131072        6.41 ms         6.40 ms          108  4.88355ns
-BM_vxsort<int32_t, 8>/262144        13.8 ms         13.8 ms           51  5.26214ns
-BM_vxsort<int32_t, 8>/524288        29.1 ms         29.0 ms           24  5.53438ns
-BM_vxsort<int32_t, 8>/1048576       59.8 ms         59.7 ms           11  5.69466ns
-```
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
@@ -11,6 +11,8 @@ target_link_libraries(${TARGET_NAME}
         ${CMAKE_PROJECT_NAME}_lib
         benchmark
         picosha2
+        fmt::fmt
+        magic_enum::magic_enum
         ${CMAKE_THREAD_LIBS_INIT})
 
 configure_file(run.sh run.sh COPYONLY)

diff --git a/bench/bench.cpp b/bench/bench.cpp
@@ -1,9 +1,28 @@
 #include "benchmark/benchmark.h"
 
+namespace vxsort_bench {
+
+void register_fullsort_avx2_i_benchmarks();
+void register_fullsort_avx2_u_benchmarks();
+void register_fullsort_avx2_f_benchmarks();
+void register_fullsort_avx512_i_benchmarks();
+void register_fullsort_avx512_u_benchmarks();
+void register_fullsort_avx512_f_benchmarks();
+
+void register_benchmarks() {
+    register_fullsort_avx2_i_benchmarks();
+    register_fullsort_avx2_u_benchmarks();
+    register_fullsort_avx2_f_benchmarks();
+    register_fullsort_avx512_i_benchmarks();
+    register_fullsort_avx512_u_benchmarks();
+    register_fullsort_avx512_f_benchmarks();
+}
+}  // namespace vxsort_bench
+
 using namespace std;
 
-int main(int argc, char** argv)
-{
-  ::benchmark::Initialize(&argc, argv);
-  ::benchmark::RunSpecifiedBenchmarks();
-}
+int main(int argc, char** argv) {
+    vxsort_bench::register_benchmarks();
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+}
diff --git a/bench/fullsort/BM_fullsort.pdqsort.cpp b/bench/fullsort/BM_fullsort.pdqsort.cpp
@@ -13,10 +13,9 @@ using namespace vxsort::types;
 template <class Q>
 static void BM_pdqsort_branchless(benchmark::State& state) {
     auto n = state.range(0);
-    auto v = std::vector<Q>((i32)n);
     const auto ITERATIONS = 10;
 
-    generate_unique_values_vec(v, (Q)0x1000, (Q)8);
+    auto v = unique_values(n, (Q) 0x1000, (Q) 8);
     auto copies = generate_copies(ITERATIONS, n, v);
     auto begins = generate_array_beginnings(copies);
     auto ends = generate_array_beginnings(copies);

diff --git a/bench/fullsort/BM_fullsort.stdsort.cpp b/bench/fullsort/BM_fullsort.stdsort.cpp
@@ -13,10 +13,9 @@ using namespace vxsort::types;
 template <class Q>
 static void BM_stdsort(benchmark::State& state) {
     auto n = state.range(0);
-    auto v = std::vector<Q>((i32)n);
     const auto ITERATIONS = 10;
 
-    generate_unique_values_vec(v, (Q)0x1000, (Q)8);
+    auto v = unique_values(n, (Q) 0x1000, (Q) 8);
     auto copies = generate_copies(ITERATIONS, n, v);
     auto begins = generate_array_beginnings(copies);
     auto ends = generate_array_beginnings(copies);

diff --git a/bench/fullsort/BM_fullsort.vxsort.avx2.f.cpp b/bench/fullsort/BM_fullsort.vxsort.avx2.f.cpp
@@ -1,27 +1,20 @@
-    #include "vxsort_targets_enable_avx2.h"
+#include "vxsort_targets_enable_avx2.h"
 
-#include <random>
 #include <benchmark/benchmark.h>
+#include <random>
 
 #include <vxsort.avx2.h>
 
 #include "BM_fullsort.vxsort.h"
 
 namespace vxsort_bench {
 using namespace vxsort::types;
-using benchmark::TimeUnit;
 using vm = vxsort::vector_machine;
 
-BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2,  1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2,  2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2,  4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2,  8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-
-BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2,  1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2,  2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2,  4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2,  8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count);
-
+void register_fullsort_avx2_f_benchmarks() {
+    register_fullsort_benchmarks<vm::AVX2, 8, f32, f64>();
 }
 
+}  // namespace vxsort_bench
+
 #include "vxsort_targets_disable.h"