Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ run-regressiontests:
- init-repo
variables:
#SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 0:30:00 -N 1 -n 1 --cpus-per-task=2 --mem-per-cpu=2G"
SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 0:30:00"
SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 10:00:00"
extends: .load-modules
script:
- export LD_LIBRARY_PATH="$(pwd)/build/libs:$LD_LIBRARY_PATH"
Expand All @@ -202,7 +202,7 @@ run-regressiontests:

- cd build/benchmarks
- cp ../../example_networks/jet-structure-classification-with-host-mem/* .
- ../_deps/googlebenchmark-src/tools/compare.py -a -d results.json benchmarks ../../benchmarks/expectedPerformance.json ./SynchronousInferenceBenchmark | tee benchmark_output.txt
- ../_deps/googlebenchmark-src/tools/compare.py -a -d results.json benchmarks ../../benchmarks/expectedPerformance.json ./RegressionTest | tee benchmark_output.txt
# Extract the OVERALL_GEOMEAN p-value and check if it exceeds the threshold
- THRESHOLD=0.05
- GEOMEAN=$(grep "OVERALL_GEOMEAN" benchmark_output.txt)
Expand Down
155 changes: 155 additions & 0 deletions benchmarks/AsynchronousInferenceBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/**
* @file AsynchronousInferenceBenchmark.cpp
* @author Linus Jungemann (linus.jungemann@uni-paderborn.de)
 * @brief Benchmarks the AsynchronousInference Performance of the Driver
* @version 0.1
* @date 2025-03-21
*
* @copyright Copyright (c) 2025
* @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
*
*/

#include <benchmark/benchmark.h>

#include <FINNCppDriver/core/BaseDriver.hpp>
#include <FINNCppDriver/utils/FinnDatatypes.hpp>
#include <FINNCppDriver/utils/Logger.hpp>
#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <queue>
#include <random>
#include <thread>
#include <vector>

// Selects the matching uniform distribution for the element type:
// std::uniform_real_distribution for float, std::uniform_int_distribution otherwise.
// NOTE(review): "destribution_t" is a typo for "distribution_t"; kept as-is because
// every benchmark in this file refers to it by this name.
template<typename O>
using destribution_t = typename std::conditional_t<std::is_same_v<O, float>, std::uniform_real_distribution<O>, std::uniform_int_distribution<O>>;

// FINN datatypes of the benchmarked network: 8-bit signed input, 16-bit signed output.
using InputFinnType = Finn::DatatypeInt<8>;
using OutputFinnType = Finn::DatatypeInt<16>;

namespace Finn {
// Driver specialised for this file's input/output FINN datatypes.
// SynchronousInference selects blocking (true) vs. asynchronous (false) operation.
template<bool SynchronousInference>
using Driver = Finn::BaseDriver<SynchronousInference, InputFinnType, OutputFinnType>;
} // namespace Finn

// Constructs a driver from the given JSON configuration file and batch size.
template<bool SynchronousInference>
Finn::Driver<SynchronousInference> createDriverFromConfig(const std::filesystem::path& configFilePath, unsigned int batchSize) {
return Finn::Driver<SynchronousInference>(configFilePath, batchSize);
}

/**
 * @brief Single-threaded asynchronous-inference throughput benchmark.
 *
 * Submits one batch of random input, blocks until its results arrive, and
 * repeats for a fixed wall-clock duration. Reports the number of inferences
 * (batches * batchSize) processed per benchmark iteration.
 *
 * @param state Google-benchmark state; state.range(0) is the batch size.
 */
static void BM_AsynchronousInferenceSingleThread(benchmark::State& state) {
    const std::string exampleNetworkConfig = "jetConfig.json";
    // `uint` is a POSIX typedef, not standard C++ — use unsigned int for portability.
    const unsigned int batchSize = static_cast<unsigned int>(state.range(0));
    std::cout << "Running single-threaded benchmark with batch size: " << batchSize << '\n';
    // false -> asynchronous (non-blocking) driver mode.
    auto driver = createDriverFromConfig<false>(exampleNetworkConfig, batchSize);
    using dtype = int8_t;

    // One reusable input buffer; 24 values per sample — assumes the network's
    // input shape has 24 elements (TODO confirm against jetConfig.json).
    std::vector<dtype> inputBuffer(24 * batchSize);

    // Fill the buffer once with uniformly distributed values covering the FINN input range.
    std::random_device rndDevice;
    std::mt19937 mersenneEngine{rndDevice()};
    destribution_t<dtype> dist{static_cast<dtype>(InputFinnType().min()), static_cast<dtype>(InputFinnType().max())};
    std::generate(inputBuffer.begin(), inputBuffer.end(), [&dist, &mersenneEngine]() { return dist(mersenneEngine); });

    // Warmup: one full round trip so one-time setup costs are not measured.
    driver.input(inputBuffer.begin(), inputBuffer.end());
    auto warmup = driver.getResults();
    benchmark::DoNotOptimize(warmup);

    // Fixed wall-clock runtime per benchmark iteration.
    constexpr std::chrono::seconds runtime{90};

    for (auto _ : state) {
        std::size_t processedCount = 0;

        // steady_clock is monotonic and thus the correct clock for measuring
        // elapsed time (high_resolution_clock may alias a non-monotonic clock).
        const auto start = std::chrono::steady_clock::now();
        while (std::chrono::steady_clock::now() - start < runtime) {
            // Submit one batch ...
            driver.input(inputBuffer.begin(), inputBuffer.end());
            // ... and wait for its results; this blocking wait is what makes
            // the benchmark single-threaded.
            auto results = driver.getResults();
            benchmark::DoNotOptimize(results);
            ++processedCount;
        }

        // Each processed batch corresponds to batchSize inferences.
        state.SetItemsProcessed(static_cast<int64_t>(processedCount * batchSize));
    }
}

// Register the function as a benchmark: batch sizes 1,2,4,...,4096; 5 repetitions each.
BENCHMARK(BM_AsynchronousInferenceSingleThread)->RangeMultiplier(2)->Range(1, 4096)->Repetitions(5);

/**
 * @brief Multi-threaded asynchronous-inference throughput benchmark.
 *
 * A producer thread continuously submits input batches while a consumer thread
 * retrieves results; both run for a fixed wall-clock duration. Reports the
 * number of inferences (completed batches * batchSize) per iteration.
 *
 * @param state Google-benchmark state; state.range(0) is the batch size.
 */
static void BM_AsynchronousInferenceMultiThread(benchmark::State& state) {
const std::string exampleNetworkConfig = "jetConfig.json";
// NOTE(review): `uint` is a POSIX typedef, not standard C++ — unsigned int would be portable.
const uint batchSize = static_cast<uint>(state.range(0));
std::cout << "Running multi-threaded benchmark with batch size: " << batchSize << std::endl;
// false -> asynchronous (non-blocking) driver mode.
auto driver = createDriverFromConfig<false>(exampleNetworkConfig, batchSize);
using dtype = int8_t;

// One shared input buffer; 24 values per sample — assumes the network's input
// shape has 24 elements (TODO confirm against jetConfig.json). The buffer is
// only read after this point, so sharing it across threads is safe.
std::vector<dtype> inputBuffer(24 * batchSize);

// Fill the buffer once with uniformly distributed values covering the FINN input range.
std::random_device rndDevice;
std::mt19937 mersenneEngine{rndDevice()};
destribution_t<dtype> dist{static_cast<dtype>(InputFinnType().min()), static_cast<dtype>(InputFinnType().max())};

// Fill all buffers with random data
std::generate(inputBuffer.begin(), inputBuffer.end(), [&dist, &mersenneEngine]() { return dist(mersenneEngine); });

// Warmup: one full round trip so one-time setup costs are not measured.
driver.input(inputBuffer.begin(), inputBuffer.end());
auto warmup = driver.getResults();
benchmark::DoNotOptimize(warmup);
std::chrono::duration<float> runtime = std::chrono::seconds(90); // Fixed runtime for the benchmark

for (auto _ : state) {
// Atomic: written by the output thread, read by the main thread after join.
std::atomic<std::size_t> processedCount = 0;

// Producer: submits input batches until stop is requested.
// NOTE(review): if driver.input() blocks while the pipeline is full, this
// thread may not observe the stop request promptly — confirm input() cannot
// block indefinitely.
std::jthread inputThread([&](std::stop_token stoken) {
// Set a fixed time for the benchmark
while (!stoken.stop_requested()) {
driver.input(inputBuffer.begin(), inputBuffer.end());
}
});

// Consumer: retrieves results and counts completed batches until stop is requested.
// NOTE(review): getResults() presumably blocks until a batch is available; if
// stop is requested while no input is pending, this call could hang — verify
// the driver unblocks it.
std::jthread outputThread([&](std::stop_token stoken) {
// Set a fixed time for the benchmark
while (!stoken.stop_requested()) {
auto results = driver.getResults();
benchmark::DoNotOptimize(results);
++processedCount;
}
// NOTE(review): sleep-based handoff — assumes the input thread exits within
// 100 ms of the stop request; a race remains if input() blocks longer.
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Make sure input thread is already exited
driver.drain(); // Drain any remaining results; might need to be accounted for in runtime for inf/s calculation
});

// Busy-wait for the fixed wall-clock runtime, then stop both workers.
const auto start = std::chrono::high_resolution_clock::now();
while (std::chrono::high_resolution_clock::now() - start < std::chrono::duration<float>(runtime)) {} // Looks stupid, but is for some reason more reliable...
inputThread.request_stop(); // Stop input thread
outputThread.request_stop(); // Stop output thread

// Explicit joins (jthread would also join on destruction) so processedCount
// is final before it is read below.
inputThread.join();
outputThread.join();
std::size_t infered = processedCount * batchSize;

// Report items processed in this iteration
state.SetItemsProcessed(static_cast<int64_t>(infered));
}
}

// Register the multi-threaded benchmark: batch sizes 1,2,4,...,4096; 5 repetitions each.
BENCHMARK(BM_AsynchronousInferenceMultiThread)->RangeMultiplier(2)->Range(1, 4096)->Repetitions(5);

BENCHMARK_MAIN();
4 changes: 3 additions & 1 deletion benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ set(FINN_BENCHMARK_DIR ${CMAKE_CURRENT_BINARY_DIR})
add_benchmark(DataPackingBenchmark.cpp)
add_benchmark(DynamicMdSpanBenchmark.cpp)
add_benchmark(SynchronousInferenceBenchmark.cpp)
add_benchmark(SPSCQueueBenchmark.cpp)
add_benchmark(SPSCQueueBenchmark.cpp)
add_benchmark(AsynchronousInferenceBenchmark.cpp)
add_benchmark(RegressionTest.cpp)
67 changes: 67 additions & 0 deletions benchmarks/RegressionTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* @file RegressionTest.cpp
* @author Linus Jungemann (linus.jungemann@uni-paderborn.de)
 * @brief Regression test benchmarking the synchronous inference performance of the Driver against expected results
* @version 0.2
* @date 2025-03-21
*
* @copyright Copyright (c) 2025
* @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
*
*/

#include <benchmark/benchmark.h>

#include <FINNCppDriver/core/BaseDriver.hpp>
#include <FINNCppDriver/utils/FinnDatatypes.hpp>
#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Selects the matching uniform distribution for the element type:
// std::uniform_real_distribution for float, std::uniform_int_distribution otherwise.
// NOTE(review): "destribution_t" is a typo for "distribution_t"; kept as-is because
// the benchmark below refers to it by this name.
template<typename O>
using destribution_t = typename std::conditional_t<std::is_same_v<O, float>, std::uniform_real_distribution<O>, std::uniform_int_distribution<O>>;

// FINN datatypes of the benchmarked network: 8-bit signed input, 16-bit signed output.
using InputFinnType = Finn::DatatypeInt<8>;
using OutputFinnType = Finn::DatatypeInt<16>;

namespace Finn {
// Driver specialised for this file's input/output FINN datatypes.
// SynchronousInference selects blocking (true) vs. asynchronous (false) operation.
template<bool SynchronousInference>
using Driver = Finn::BaseDriver<SynchronousInference, InputFinnType, OutputFinnType>;
} // namespace Finn

// Constructs a driver from the given JSON configuration file and batch size.
template<bool SynchronousInference>
Finn::Driver<SynchronousInference> createDriverFromConfig(const std::filesystem::path& configFilePath, unsigned int batchSize) {
return Finn::Driver<SynchronousInference>(configFilePath, batchSize);
}

/**
 * @brief Synchronous-inference regression benchmark.
 *
 * Runs blocking inferSynchronous() calls on random input data; the CI
 * pipeline compares the resulting timings against the stored expected
 * performance (see expectedPerformance.json in the benchmarks directory).
 *
 * @param state Google-benchmark state; state.range(0) is the batch size.
 */
static void BM_SynchronousInference(benchmark::State& state) {
    const std::string exampleNetworkConfig = "jetConfig.json";
    // `uint` is a POSIX typedef, not standard C++ — use unsigned int for portability.
    const unsigned int batchSize = static_cast<unsigned int>(state.range(0));
    // true -> synchronous (blocking) driver mode.
    auto driver = createDriverFromConfig<true>(exampleNetworkConfig, batchSize);
    using dtype = int8_t;
    // 24 values per sample — assumes the network's input shape has 24 elements
    // (TODO confirm against jetConfig.json).
    Finn::vector<dtype> testInputs(24 * batchSize);

    std::random_device rndDevice;
    std::mt19937 mersenneEngine{rndDevice()};  // Generates random integers

    // Uniform distribution covering the FINN input datatype's value range.
    destribution_t<dtype> dist{static_cast<dtype>(InputFinnType().min()), static_cast<dtype>(InputFinnType().max())};
    auto gen = [&dist, &mersenneEngine]() { return dist(mersenneEngine); };

    // Warmup with constant data so one-time driver setup costs are excluded.
    std::fill(testInputs.begin(), testInputs.end(), 1);
    auto warmup = driver.inferSynchronous(testInputs.begin(), testInputs.end());
    benchmark::DoNotOptimize(warmup);

    // Measured loop: one blocking inference per benchmark iteration on random data.
    std::generate(testInputs.begin(), testInputs.end(), gen);
    for (auto _ : state) {
        auto ret = driver.inferSynchronous(testInputs.begin(), testInputs.end());
        benchmark::DoNotOptimize(ret);
        benchmark::ClobberMemory();
    }
}
// Register the function as a benchmark: batch sizes 1,2,4,...,4096;
// 1,000,000 iterations and 10 repetitions per size for stable regression data.
BENCHMARK(BM_SynchronousInference)->Iterations(1000000)->RangeMultiplier(2)->Range(1, 4 << 10)->Repetitions(10);

BENCHMARK_MAIN();
Loading