Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ run-regressiontests:
- init-repo
variables:
#SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 0:30:00 -N 1 -n 1 --cpus-per-task=2 --mem-per-cpu=2G"
SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 0:30:00"
SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 10:00:00"
extends: .load-modules
script:
- export LD_LIBRARY_PATH="$(pwd)/build/libs:$LD_LIBRARY_PATH"
Expand All @@ -202,7 +202,7 @@ run-regressiontests:

- cd build/benchmarks
- cp ../../example_networks/jet-structure-classification-with-host-mem/* .
- ../_deps/googlebenchmark-src/tools/compare.py -a -d results.json benchmarks ../../benchmarks/expectedPerformance.json ./SynchronousInferenceBenchmark | tee benchmark_output.txt
- ../_deps/googlebenchmark-src/tools/compare.py -a -d results.json benchmarks ../../benchmarks/expectedPerformance.json ./RegressionTest | tee benchmark_output.txt
# Extract the OVERALL_GEOMEAN p-value and check if it exceeds the threshold
- THRESHOLD=0.05
- GEOMEAN=$(grep "OVERALL_GEOMEAN" benchmark_output.txt)
Expand Down
155 changes: 155 additions & 0 deletions benchmarks/AsynchronousInferenceBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/**
* @file AsynchronousInferenceBenchmark.cpp
* @author Linus Jungemann (linus.jungemann@uni-paderborn.de)
 * @brief Benchmarks the AsynchronousInference Performance of the Driver
* @version 0.1
* @date 2025-03-21
*
* @copyright Copyright (c) 2025
* @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
*
*/

#include <benchmark/benchmark.h>

#include <FINNCppDriver/core/BaseDriver.hpp>
#include <FINNCppDriver/utils/FinnDatatypes.hpp>
#include <FINNCppDriver/utils/Logger.hpp>
#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <queue>
#include <random>
#include <thread>
#include <vector>

// Selects the matching uniform distribution for the element type:
// std::uniform_real_distribution for float, std::uniform_int_distribution otherwise.
// NOTE(review): "destribution_t" is a typo for "distribution_t"; kept as-is because
// every benchmark in this file refers to it by this name.
template<typename O>
using destribution_t = typename std::conditional_t<std::is_same_v<O, float>, std::uniform_real_distribution<O>, std::uniform_int_distribution<O>>;

// FINN datatypes of the benchmarked network: 8-bit signed input, 16-bit signed output.
using InputFinnType = Finn::DatatypeInt<8>;
using OutputFinnType = Finn::DatatypeInt<16>;

namespace Finn {
// Driver specialised for this file's input/output FINN datatypes.
// SynchronousInference selects blocking (true) vs. asynchronous (false) operation.
template<bool SynchronousInference>
using Driver = Finn::BaseDriver<SynchronousInference, InputFinnType, OutputFinnType>;
} // namespace Finn

// Constructs a driver from the given JSON configuration file and batch size.
template<bool SynchronousInference>
Finn::Driver<SynchronousInference> createDriverFromConfig(const std::filesystem::path& configFilePath, unsigned int batchSize) {
return Finn::Driver<SynchronousInference>(configFilePath, batchSize);
}

/**
 * @brief Single-threaded asynchronous-inference throughput benchmark.
 *
 * Submits one batch of random input, blocks until its results arrive, and
 * repeats for a fixed wall-clock duration. Reports the number of inferences
 * (batches * batchSize) processed per benchmark iteration.
 *
 * @param state Google-benchmark state; state.range(0) is the batch size.
 */
static void BM_AsynchronousInferenceSingleThread(benchmark::State& state) {
    const std::string exampleNetworkConfig = "jetConfig.json";
    // `uint` is a POSIX typedef, not standard C++ — use unsigned int for portability.
    const unsigned int batchSize = static_cast<unsigned int>(state.range(0));
    std::cout << "Running single-threaded benchmark with batch size: " << batchSize << '\n';
    // false -> asynchronous (non-blocking) driver mode.
    auto driver = createDriverFromConfig<false>(exampleNetworkConfig, batchSize);
    using dtype = int8_t;

    // One reusable input buffer; 24 values per sample — assumes the network's
    // input shape has 24 elements (TODO confirm against jetConfig.json).
    std::vector<dtype> inputBuffer(24 * batchSize);

    // Fill the buffer once with uniformly distributed values covering the FINN input range.
    std::random_device rndDevice;
    std::mt19937 mersenneEngine{rndDevice()};
    destribution_t<dtype> dist{static_cast<dtype>(InputFinnType().min()), static_cast<dtype>(InputFinnType().max())};
    std::generate(inputBuffer.begin(), inputBuffer.end(), [&dist, &mersenneEngine]() { return dist(mersenneEngine); });

    // Warmup: one full round trip so one-time setup costs are not measured.
    driver.input(inputBuffer.begin(), inputBuffer.end());
    auto warmup = driver.getResults();
    benchmark::DoNotOptimize(warmup);

    // Fixed wall-clock runtime per benchmark iteration.
    constexpr std::chrono::seconds runtime{90};

    for (auto _ : state) {
        std::size_t processedCount = 0;

        // steady_clock is monotonic and thus the correct clock for measuring
        // elapsed time (high_resolution_clock may alias a non-monotonic clock).
        const auto start = std::chrono::steady_clock::now();
        while (std::chrono::steady_clock::now() - start < runtime) {
            // Submit one batch ...
            driver.input(inputBuffer.begin(), inputBuffer.end());
            // ... and wait for its results; this blocking wait is what makes
            // the benchmark single-threaded.
            auto results = driver.getResults();
            benchmark::DoNotOptimize(results);
            ++processedCount;
        }

        // Each processed batch corresponds to batchSize inferences.
        state.SetItemsProcessed(static_cast<int64_t>(processedCount * batchSize));
    }
}

// Register the function as a benchmark: batch sizes 1,2,4,...,4096; 5 repetitions each.
BENCHMARK(BM_AsynchronousInferenceSingleThread)->RangeMultiplier(2)->Range(1, 4096)->Repetitions(5);

/**
 * @brief Multi-threaded asynchronous-inference throughput benchmark.
 *
 * A producer thread continuously submits input batches while a consumer thread
 * retrieves results; both run for a fixed wall-clock duration. Reports the
 * number of inferences (completed batches * batchSize) per iteration.
 *
 * @param state Google-benchmark state; state.range(0) is the batch size.
 */
static void BM_AsynchronousInferenceMultiThread(benchmark::State& state) {
const std::string exampleNetworkConfig = "jetConfig.json";
// NOTE(review): `uint` is a POSIX typedef, not standard C++ — unsigned int would be portable.
const uint batchSize = static_cast<uint>(state.range(0));
std::cout << "Running multi-threaded benchmark with batch size: " << batchSize << std::endl;
// false -> asynchronous (non-blocking) driver mode.
auto driver = createDriverFromConfig<false>(exampleNetworkConfig, batchSize);
using dtype = int8_t;

// One shared input buffer; 24 values per sample — assumes the network's input
// shape has 24 elements (TODO confirm against jetConfig.json). The buffer is
// only read after this point, so sharing it across threads is safe.
std::vector<dtype> inputBuffer(24 * batchSize);

// Fill the buffer once with uniformly distributed values covering the FINN input range.
std::random_device rndDevice;
std::mt19937 mersenneEngine{rndDevice()};
destribution_t<dtype> dist{static_cast<dtype>(InputFinnType().min()), static_cast<dtype>(InputFinnType().max())};

// Fill all buffers with random data
std::generate(inputBuffer.begin(), inputBuffer.end(), [&dist, &mersenneEngine]() { return dist(mersenneEngine); });

// Warmup: one full round trip so one-time setup costs are not measured.
driver.input(inputBuffer.begin(), inputBuffer.end());
auto warmup = driver.getResults();
benchmark::DoNotOptimize(warmup);
std::chrono::duration<float> runtime = std::chrono::seconds(90); // Fixed runtime for the benchmark

for (auto _ : state) {
// Atomic: written by the output thread, read by the main thread after join.
std::atomic<std::size_t> processedCount = 0;

// Producer: submits input batches until stop is requested.
// NOTE(review): if driver.input() blocks while the pipeline is full, this
// thread may not observe the stop request promptly — confirm input() cannot
// block indefinitely.
std::jthread inputThread([&](std::stop_token stoken) {
// Set a fixed time for the benchmark
while (!stoken.stop_requested()) {
driver.input(inputBuffer.begin(), inputBuffer.end());
}
});

// Consumer: retrieves results and counts completed batches until stop is requested.
// NOTE(review): getResults() presumably blocks until a batch is available; if
// stop is requested while no input is pending, this call could hang — verify
// the driver unblocks it.
std::jthread outputThread([&](std::stop_token stoken) {
// Set a fixed time for the benchmark
while (!stoken.stop_requested()) {
auto results = driver.getResults();
benchmark::DoNotOptimize(results);
++processedCount;
}
// NOTE(review): sleep-based handoff — assumes the input thread exits within
// 100 ms of the stop request; a race remains if input() blocks longer.
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Make sure input thread is already exited
driver.drain(); // Drain any remaining results; might need to be accounted for in runtime for inf/s calculation
});

// Busy-wait for the fixed wall-clock runtime, then stop both workers.
const auto start = std::chrono::high_resolution_clock::now();
while (std::chrono::high_resolution_clock::now() - start < std::chrono::duration<float>(runtime)) {} // Looks stupid, but is for some reason more reliable...
inputThread.request_stop(); // Stop input thread
outputThread.request_stop(); // Stop output thread

// Explicit joins (jthread would also join on destruction) so processedCount
// is final before it is read below.
inputThread.join();
outputThread.join();
std::size_t infered = processedCount * batchSize;

// Report items processed in this iteration
state.SetItemsProcessed(static_cast<int64_t>(infered));
}
}

// Register the multi-threaded benchmark: batch sizes 1,2,4,...,4096; 5 repetitions each.
BENCHMARK(BM_AsynchronousInferenceMultiThread)->RangeMultiplier(2)->Range(1, 4096)->Repetitions(5);

BENCHMARK_MAIN();
4 changes: 3 additions & 1 deletion benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ set(FINN_BENCHMARK_DIR ${CMAKE_CURRENT_BINARY_DIR})
add_benchmark(DataPackingBenchmark.cpp)
add_benchmark(DynamicMdSpanBenchmark.cpp)
add_benchmark(SynchronousInferenceBenchmark.cpp)
add_benchmark(SPSCQueueBenchmark.cpp)
add_benchmark(SPSCQueueBenchmark.cpp)
add_benchmark(AsynchronousInferenceBenchmark.cpp)
add_benchmark(RegressionTest.cpp)
67 changes: 67 additions & 0 deletions benchmarks/RegressionTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* @file RegressionTest.cpp
* @author Linus Jungemann (linus.jungemann@uni-paderborn.de)
 * @brief Regression test benchmarking the synchronous inference performance of the Driver against expected results
* @version 0.2
* @date 2025-03-21
*
* @copyright Copyright (c) 2025
* @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
*
*/

#include <benchmark/benchmark.h>

#include <FINNCppDriver/core/BaseDriver.hpp>
#include <FINNCppDriver/utils/FinnDatatypes.hpp>
#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Selects the matching uniform distribution for the element type:
// std::uniform_real_distribution for float, std::uniform_int_distribution otherwise.
// NOTE(review): "destribution_t" is a typo for "distribution_t"; kept as-is because
// the benchmark below refers to it by this name.
template<typename O>
using destribution_t = typename std::conditional_t<std::is_same_v<O, float>, std::uniform_real_distribution<O>, std::uniform_int_distribution<O>>;

// FINN datatypes of the benchmarked network: 8-bit signed input, 16-bit signed output.
using InputFinnType = Finn::DatatypeInt<8>;
using OutputFinnType = Finn::DatatypeInt<16>;

namespace Finn {
// Driver specialised for this file's input/output FINN datatypes.
// SynchronousInference selects blocking (true) vs. asynchronous (false) operation.
template<bool SynchronousInference>
using Driver = Finn::BaseDriver<SynchronousInference, InputFinnType, OutputFinnType>;
} // namespace Finn

// Constructs a driver from the given JSON configuration file and batch size.
template<bool SynchronousInference>
Finn::Driver<SynchronousInference> createDriverFromConfig(const std::filesystem::path& configFilePath, unsigned int batchSize) {
return Finn::Driver<SynchronousInference>(configFilePath, batchSize);
}

/**
 * @brief Synchronous-inference regression benchmark.
 *
 * Runs blocking inferSynchronous() calls on random input data; the CI
 * pipeline compares the resulting timings against the stored expected
 * performance (see expectedPerformance.json in the benchmarks directory).
 *
 * @param state Google-benchmark state; state.range(0) is the batch size.
 */
static void BM_SynchronousInference(benchmark::State& state) {
    const std::string exampleNetworkConfig = "jetConfig.json";
    // `uint` is a POSIX typedef, not standard C++ — use unsigned int for portability.
    const unsigned int batchSize = static_cast<unsigned int>(state.range(0));
    // true -> synchronous (blocking) driver mode.
    auto driver = createDriverFromConfig<true>(exampleNetworkConfig, batchSize);
    using dtype = int8_t;
    // 24 values per sample — assumes the network's input shape has 24 elements
    // (TODO confirm against jetConfig.json).
    Finn::vector<dtype> testInputs(24 * batchSize);

    std::random_device rndDevice;
    std::mt19937 mersenneEngine{rndDevice()};  // Generates random integers

    // Uniform distribution covering the FINN input datatype's value range.
    destribution_t<dtype> dist{static_cast<dtype>(InputFinnType().min()), static_cast<dtype>(InputFinnType().max())};
    auto gen = [&dist, &mersenneEngine]() { return dist(mersenneEngine); };

    // Warmup with constant data so one-time driver setup costs are excluded.
    std::fill(testInputs.begin(), testInputs.end(), 1);
    auto warmup = driver.inferSynchronous(testInputs.begin(), testInputs.end());
    benchmark::DoNotOptimize(warmup);

    // Measured loop: one blocking inference per benchmark iteration on random data.
    std::generate(testInputs.begin(), testInputs.end(), gen);
    for (auto _ : state) {
        auto ret = driver.inferSynchronous(testInputs.begin(), testInputs.end());
        benchmark::DoNotOptimize(ret);
        benchmark::ClobberMemory();
    }
}
// Register the function as a benchmark: batch sizes 1,2,4,...,4096;
// 1,000,000 iterations and 10 repetitions per size for stable regression data.
BENCHMARK(BM_SynchronousInference)->Iterations(1000000)->RangeMultiplier(2)->Range(1, 4 << 10)->Repetitions(10);

BENCHMARK_MAIN();
Loading