diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d3c4339..f5f7884 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -16,19 +16,19 @@ workflow:
 
 .load-modules:
   before_script:
-    - ml vis/Graphviz/8.1.0-GCCcore-12.3.0 fpga xilinx/xrt/2.16 devel/Doxygen/1.9.7-GCCcore-12.3.0 compiler/GCC/12.3.0 devel/CMake/3.26.3-GCCcore-12.3.0 devel/Boost/1.82.0-GCC-12.3.0
+    - ml vis/Graphviz/8.1.0-GCCcore-12.3.0 fpga xilinx/xrt/2.16 devel/Doxygen/1.9.8-GCCcore-13.2.0 compiler/GCC/13.3.0 devel/CMake/3.29.3-GCCcore-13.3.0
     
-build-dependencies:
+init-repo:
   id_tokens:
     CI_JOB_JWT:
       aud: https://git.uni-paderborn.de
   stage: prepare
   variables:
     #SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 2:00:00 -N 1 -n 1 --cpus-per-task=128"
-    SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 2:00:00"
+    SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 0:05:00"
   extends: .load-modules
   script:
-    - ./buildDependencies.sh
+    - git submodule update --init --recursive
   artifacts:
     paths:
       - "deps/"
@@ -40,7 +40,7 @@ build-unittests:
       aud: https://git.uni-paderborn.de
   stage: build
   needs:
-    - build-dependencies
+    - init-repo
   variables:
     #SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 2:00:00 -N 1 -n 1 --cpus-per-task=128"
     SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 2:00:00"
@@ -60,7 +60,7 @@ build-integrationtests:
       aud: https://git.uni-paderborn.de
   stage: build
   needs:
-    - build-dependencies
+    - init-repo
   variables:
     #SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 2:00:00 -N 1 -n 1 --cpus-per-task=128"
     SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 2:00:00"
@@ -85,7 +85,7 @@ build-regressiontests:
       aud: https://git.uni-paderborn.de
   stage: build
   needs:
-    - build-dependencies
+    - init-repo
   variables:
     #SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 2:00:00 -N 1 -n 1 --cpus-per-task=128"
     SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 2:00:00"
@@ -122,13 +122,13 @@ run-unittests:
   stage: test
   needs:
     - build-unittests
-    - build-dependencies
+    - init-repo
   variables:
     #SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 0:30:00 -N 1 -n 1 --cpus-per-task=2 --mem-per-cpu=2G"
     SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 2:00:00"
   extends: .load-modules
   script:
-    - export LD_LIBRARY_PATH="$(pwd)/deps/finn_boost/stage/lib/boost/:$LD_LIBRARY_PATH"
+    - export LD_LIBRARY_PATH="$(pwd)/build/libs:$LD_LIBRARY_PATH"
     - cd build
     - ctest --output-on-failure --output-junit ctest-results.xml
   artifacts:
@@ -146,12 +146,12 @@ run-integrationtests:
   stage: test
   needs:
     - build-integrationtests
-    - build-dependencies
+    - init-repo
   variables:
     SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc  -t 0:30:00"
   extends: .load-modules
   script:
-    - export LD_LIBRARY_PATH="$(pwd)/deps/finn_boost/stage/lib/boost/:$LD_LIBRARY_PATH"
+    - export LD_LIBRARY_PATH="$(pwd)/build/libs:$LD_LIBRARY_PATH"
     # Reset FPGAs and setup Host Memory Access
     - ml xilinx/u55c/u55c_gen3x16_xdma_3_202210_1
     - xbutil reset -d 0000:c1:00.1
@@ -184,13 +184,13 @@ run-regressiontests:
   stage: test
   needs:
     - build-regressiontests
-    - build-dependencies
+    - init-repo
   variables:
     #SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p normal -t 0:30:00 -N 1 -n 1 --cpus-per-task=2 --mem-per-cpu=2G"
     SCHEDULER_PARAMETERS: "-A hpc-prf-ekiapp -p hacc -t 0:30:00"
   extends: .load-modules
   script:
-    - export LD_LIBRARY_PATH="$(pwd)/deps/finn_boost/stage/lib/boost/:$LD_LIBRARY_PATH"
+    - export LD_LIBRARY_PATH="$(pwd)/build/libs:$LD_LIBRARY_PATH"
     # Reset FPGAs and setup Host Memory Access
     - ml xilinx/u55c/u55c_gen3x16_xdma_3_202210_1
     - xbutil reset -d 0000:c1:00.1
diff --git a/.gitmodules b/.gitmodules
index 1e4a39c..4b433e2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,9 @@
 [submodule "external/xsimd"]
 	path = external/xsimd
 	url = https://github.com/xtensor-stack/xsimd.git
+[submodule "external/popl"]
+	path = external/popl
+	url = https://github.com/badaix/popl.git
+[submodule "external/plog"]
+	path = external/plog
+	url = https://github.com/SergiusTheBest/plog.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2f442ce..b12f54c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,14 +72,6 @@ endif()
 
 #INCLUDES
 
-#Set Boost policies for compatibility
-if(POLICY CMP0167)
-  cmake_policy(SET CMP0167 OLD)
-endif()
-if(POLICY CMP0144)
-  cmake_policy(SET CMP0144 NEW)
-endif()
-
 #Threads
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
@@ -92,8 +84,11 @@ find_package(XRT REQUIRED)
 #OpenMP
 find_package(OpenMP REQUIRED)
 
-#Boost
-find_package(Boost 1.79.0 COMPONENTS system log log_setup program_options filesystem ${BOOST_THREAD} REQUIRED)
+#popl
+include_directories(SYSTEM external/popl/include)
+
+#plog
+include_directories(SYSTEM external/plog/include)
 
 #xsimd
 add_subdirectory(external/xsimd)
diff --git a/README.md b/README.md
index f7e4b44..4e1ee00 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ It is assumed, that you used FINN and now want to build the generated driver. Co
 Building the driver is as easy as running:
 
 ```bash
-./buildDependencies.sh
+git submodule update --init --recursive
 mkdir build && cd build
 cmake -DCMAKE_BUILD_TYPE=Release -DFINN_ENABLE_SANITIZERS=OFF -DFINN_HEADER_LOCATION=../AcceleratorDatatypes.h -DFINN_USE_HOST_MEM=OFF ..
 make -j $(nprocs)
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 2a7ef2e..e1fdde4 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -5,4 +5,5 @@ set(FINN_BENCHMARK_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
 add_benchmark(DataPackingBenchmark.cpp)
 add_benchmark(DynamicMdSpanBenchmark.cpp)
-add_benchmark(SynchronousInferenceBenchmark.cpp)
\ No newline at end of file
+add_benchmark(SynchronousInferenceBenchmark.cpp)
+add_benchmark(SPSCQueueBenchmark.cpp)
\ No newline at end of file
diff --git a/benchmarks/SPSCQueueBenchmark.cpp b/benchmarks/SPSCQueueBenchmark.cpp
new file mode 100644
index 0000000..a881f56
--- /dev/null
+++ b/benchmarks/SPSCQueueBenchmark.cpp
@@ -0,0 +1,528 @@
+#include <benchmark/benchmark.h>
+
+#include <FINNCppDriver/utils/SPSCQueue.hpp>
+#include <atomic>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
+
+// Benchmark for trivial type enqueue/dequeue operations
+template<size_t QueueSize>
+static void BM_TrivialEnqueueDequeue(benchmark::State& state) {
+    SPSCQueue<int, QueueSize> queue;
+    // Ensure we don't exceed queue capacity
+    const size_t operations_per_iteration = std::min<size_t>(static_cast<size_t>(state.range(0)), queue.capacity());
+
+    for (auto _ : state) {
+        // Enqueue phase
+        size_t enqueued = 0;
+        for (size_t i = 0; i < operations_per_iteration; ++i) {
+            if (queue.try_enqueue(static_cast<int>(i))) {
+                enqueued++;
+            }
+        }
+
+        // Dequeue phase
+        int item;
+        size_t dequeued = 0;
+        for (size_t i = 0; i < enqueued; ++i) {
+            if (queue.try_dequeue(item)) {
+                dequeued++;
+            }
+        }
+
+        // Make sure we didn't lose any items
+        if (enqueued != dequeued) {
+            state.SkipWithError("Enqueue/dequeue count mismatch");
+            break;
+        }
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * operations_per_iteration * 2));  // enqueue + dequeue
+    state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * operations_per_iteration * sizeof(int) * 2));
+}
+
+// Benchmark for non-trivial type enqueue/dequeue operations
+template<size_t QueueSize>
+static void BM_NonTrivialEnqueueDequeue(benchmark::State& state) {
+    SPSCQueue<std::string, QueueSize> queue;
+    std::string testString = "benchmark-test-string";
+    // Ensure we don't exceed queue capacity
+    const size_t operations_per_iteration = std::min<size_t>(static_cast<size_t>(state.range(0)), queue.capacity());
+
+    for (auto _ : state) {
+        // Enqueue phase
+        size_t enqueued = 0;
+        for (size_t i = 0; i < operations_per_iteration; ++i) {
+            if (queue.try_enqueue(testString)) {
+                enqueued++;
+            }
+        }
+
+        // Dequeue phase
+        std::string item;
+        size_t dequeued = 0;
+        for (size_t i = 0; i < enqueued; ++i) {
+            if (queue.try_dequeue(item)) {
+                dequeued++;
+            }
+        }
+
+        // Make sure we didn't lose any items
+        if (enqueued != dequeued) {
+            state.SkipWithError("Enqueue/dequeue count mismatch");
+            break;
+        }
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * operations_per_iteration * 2));
+    state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * operations_per_iteration * testString.size() * 2));
+}
+
+// Benchmark for multi-threaded producer-consumer pattern
+template<size_t QueueSize, typename T>
+static void BM_ProducerConsumer(benchmark::State& state) {
+    for (auto _ : state) {
+        state.PauseTiming();
+        SPSCQueue<T, QueueSize> queue;
+        std::atomic<bool> producer_done{false};
+        std::atomic<size_t> items_produced{0};
+        std::atomic<size_t> items_consumed{0};
+
+        // Initialize value based on type
+        T value;
+        if constexpr (std::is_same_v<T, int>) {
+            value = 42;  // For int type
+        } else if constexpr (std::is_same_v<T, std::string>) {
+            value = "test-string";  // For string type
+        }
+
+        // Use a smaller maximum to avoid potential deadlocks
+        const size_t num_items = std::min<size_t>(static_cast<size_t>(state.range(0)), queue.capacity() * 5);
+
+        // Start timing again before creating threads
+        state.ResumeTiming();
+
+        // Producer thread - uses non-blocking enqueue to avoid deadlocks
+        std::thread producer([&queue, &producer_done, &items_produced, &num_items, value]() {
+            while (items_produced.load(std::memory_order_relaxed) < num_items) {
+                if (queue.try_enqueue(value)) {
+                    items_produced.fetch_add(1, std::memory_order_relaxed);
+                } else {
+                    // Small yield to prevent busy waiting
+                    std::this_thread::yield();
+                }
+            }
+            producer_done.store(true, std::memory_order_release);
+        });
+
+        // Consumer thread
+        std::thread consumer([&queue, &producer_done, &items_consumed, &num_items, &items_produced]() {
+            T item;
+            while (items_consumed.load(std::memory_order_relaxed) < num_items) {
+                if (queue.try_dequeue(item)) {
+                    items_consumed.fetch_add(1, std::memory_order_relaxed);
+                } else if (producer_done.load(std::memory_order_acquire) && items_consumed.load(std::memory_order_relaxed) >= items_produced.load(std::memory_order_relaxed)) {
+                    // All items have been produced and consumed
+                    break;
+                } else {
+                    // Small yield to prevent busy waiting
+                    std::this_thread::yield();
+                }
+            }
+        });
+
+        producer.join();
+        consumer.join();
+
+        // Verify all items were processed
+        benchmark::DoNotOptimize(items_consumed.load(std::memory_order_relaxed));
+
+        if (items_consumed.load(std::memory_order_relaxed) != num_items) {
+            state.SkipWithError("Not all items were processed");
+            break;
+        }
+    }
+
+    const size_t num_items = std::min<size_t>(static_cast<size_t>(state.range(0)), QueueSize * 5);
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_items * 2));
+    if constexpr (std::is_same_v<T, std::string>) {
+        state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * num_items * sizeof("test-string") * 2));
+    } else {
+        state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * num_items * sizeof(T) * 2));
+    }
+}
+
+// Benchmark for bulk dequeue operations
+template<size_t QueueSize>
+static void BM_BulkDequeue(benchmark::State& state) {
+    SPSCQueue<int, QueueSize> queue;
+    const auto bulk_size = static_cast<size_t>(state.range(1));
+    std::vector<int> items(bulk_size);
+
+    // Ensure we don't exceed queue capacity
+    const size_t num_items = std::min<size_t>(static_cast<size_t>(state.range(0)), queue.capacity());
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Fill the queue
+        size_t enqueued = 0;
+        for (size_t i = 0; i < num_items; ++i) {
+            if (queue.try_enqueue(static_cast<int>(i))) {
+                enqueued++;
+            }
+        }
+        state.ResumeTiming();
+
+        // Dequeue in bulk
+        size_t total_dequeued = 0;
+        while (total_dequeued < enqueued) {
+            size_t batch_size = std::min(bulk_size, enqueued - total_dequeued);
+            size_t dequeued = queue.try_dequeue_bulk(items.begin(), batch_size);
+            if (dequeued == 0)
+                break;  // Avoid infinite loop if dequeue fails
+            total_dequeued += dequeued;
+        }
+
+        if (total_dequeued != enqueued) {
+            state.SkipWithError("Not all items were dequeued");
+            break;
+        }
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_items));
+    state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * num_items * sizeof(int)));
+}
+
+// Benchmark comparing individual vs bulk dequeue
+static void BM_IndividualVsBulkDequeue(benchmark::State& state) {
+    constexpr size_t QueueSize = 1024;
+    SPSCQueue<int, QueueSize> queue;
+    const bool use_bulk = state.range(0) == 1;
+
+    // Make sure we don't exceed queue capacity
+    const int total_items = std::min(10000, static_cast<int>(queue.capacity()));
+    const int bulk_size = std::min(100, total_items);
+    std::vector<int> items(bulk_size);
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Fill the queue
+        size_t enqueued = 0;
+        for (int i = 0; i < total_items; ++i) {
+            if (queue.try_enqueue(i)) {
+                enqueued++;
+            }
+        }
+        state.ResumeTiming();
+
+        size_t dequeued = 0;
+        if (use_bulk) {
+            // Bulk dequeue
+            while (dequeued < enqueued) {
+                size_t batch_size = std::min(static_cast<size_t>(bulk_size), enqueued - dequeued);
+                size_t batch_dequeued = queue.try_dequeue_bulk(items.begin(), batch_size);
+                if (batch_dequeued == 0)
+                    break;  // Avoid infinite loop
+                dequeued += batch_dequeued;
+            }
+        } else {
+            // Individual dequeue
+            int item;
+            for (size_t i = 0; i < enqueued; ++i) {
+                if (queue.try_dequeue(item)) {
+                    dequeued++;
+                } else {
+                    break;  // Stop if dequeue fails
+                }
+            }
+        }
+
+        if (dequeued != enqueued) {
+            state.SkipWithError("Not all items were dequeued");
+            break;
+        }
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * total_items));
+    state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * total_items * sizeof(int)));
+}
+
+// Benchmark for latency measurement using std::chrono instead of cycleclock
+template<size_t QueueSize>
+static void BM_EnqueueDequeueLatency(benchmark::State& state) {
+    SPSCQueue<int64_t, QueueSize> queue;
+
+    for (auto _ : state) {
+        auto start = std::chrono::high_resolution_clock::now();
+        bool enq_success = queue.try_enqueue(0);
+
+        int64_t item = 0;
+        bool deq_success = queue.try_dequeue(item);
+
+        auto end = std::chrono::high_resolution_clock::now();
+
+        if (!enq_success || !deq_success) {
+            state.SkipWithError("Enqueue or dequeue failed");
+            break;
+        }
+
+        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
+        state.SetIterationTime(duration.count() / 1e9);
+    }
+}
+
+// Benchmark for emplace performance
+template<size_t QueueSize>
+static void BM_EmplaceVsEnqueue(benchmark::State& state) {
+    SPSCQueue<std::pair<int, std::string>, QueueSize> queue;
+    const bool use_emplace = state.range(0) == 1;
+
+    // Make sure we don't exceed queue capacity
+    const auto num_items = static_cast<int>(std::min<size_t>(static_cast<size_t>(state.range(1)), static_cast<size_t>(queue.capacity())));
+
+    for (auto _ : state) {
+        // Track successful operations
+        size_t enqueued = 0;
+
+        if (use_emplace) {
+            // Use emplace
+            for (int i = 0; i < num_items; ++i) {
+                if (queue.try_emplace(i, "test-string")) {
+                    enqueued++;
+                }
+            }
+        } else {
+            // Use regular enqueue with constructor
+            for (int i = 0; i < num_items; ++i) {
+                if (queue.try_enqueue(std::make_pair(i, "test-string"))) {
+                    enqueued++;
+                }
+            }
+        }
+
+        // Dequeue all items
+        std::pair<int, std::string> item;
+        size_t dequeued = 0;
+        for (size_t i = 0; i < enqueued; ++i) {
+            if (queue.try_dequeue(item)) {
+                dequeued++;
+            }
+        }
+
+        if (dequeued != enqueued) {
+            state.SkipWithError("Not all items were dequeued");
+            break;
+        }
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_items * 2));
+}
+
+// Benchmark for bulk enqueue operations
+template<size_t QueueSize>
+static void BM_BulkEnqueue(benchmark::State& state) {
+    SPSCQueue<int, QueueSize> queue;
+    const auto bulk_size = static_cast<size_t>(state.range(1));
+    std::vector<int> items(bulk_size);
+
+    // Fill the items vector with test data
+    for (size_t i = 0; i < bulk_size; ++i) {
+        items[i] = static_cast<int>(i);
+    }
+
+    // Ensure we don't exceed queue capacity
+    const size_t num_operations = std::min<size_t>(static_cast<size_t>(state.range(0)), QueueSize / bulk_size);
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Clear the queue before each measurement
+        int temp;
+        while (queue.try_dequeue(temp)) {}
+        state.ResumeTiming();
+
+        // Enqueue in bulk
+        size_t total_enqueued = 0;
+        for (size_t i = 0; i < num_operations; ++i) {
+            size_t enqueued = queue.try_enqueue_bulk(items.begin(), bulk_size);
+            total_enqueued += enqueued;
+
+            // If we couldn't enqueue the full batch, break to avoid infinite loop
+            if (enqueued < bulk_size)
+                break;
+        }
+
+        // Make sure we dequeue everything for the next iteration
+        state.PauseTiming();
+        while (queue.try_dequeue(temp)) {}
+        state.ResumeTiming();
+
+        // Record how many items we processed
+        benchmark::DoNotOptimize(total_enqueued);
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * bulk_size * num_operations));
+    state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bulk_size * num_operations * sizeof(int)));
+}
+
+// Benchmark comparing individual vs bulk enqueue
+static void BM_IndividualVsBulkEnqueue(benchmark::State& state) {
+    constexpr size_t QueueSize = 1024;
+    SPSCQueue<int, QueueSize> queue;
+    const bool use_bulk = state.range(0) == 1;
+
+    // Make sure we don't exceed queue capacity
+    const int total_items = std::min(10000, static_cast<int>(queue.capacity()));
+    const int bulk_size = std::min(100, total_items);
+    std::vector<int> items(bulk_size);
+
+    // Fill the items vector with test data
+    for (int i = 0; i < bulk_size; ++i) {
+        items[i] = i;
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Clear the queue before each measurement
+        int temp;
+        while (queue.try_dequeue(temp)) {}
+        state.ResumeTiming();
+
+        size_t enqueued = 0;
+        if (use_bulk) {
+            // Bulk enqueue
+            for (size_t i = 0; i < total_items; i += bulk_size) {
+                size_t batch_size = std::min(static_cast<size_t>(bulk_size), static_cast<size_t>(total_items) - i);
+                size_t batch_enqueued = queue.try_enqueue_bulk(items.begin(), batch_size);
+                if (batch_enqueued < batch_size)
+                    break;  // Stop if queue is full
+                enqueued += batch_enqueued;
+            }
+        } else {
+            // Individual enqueue
+            for (int i = 0; i < total_items; ++i) {
+                if (queue.try_enqueue(i)) {
+                    enqueued++;
+                } else {
+                    break;  // Stop if queue is full
+                }
+            }
+        }
+
+        // Empty the queue for the next iteration
+        state.PauseTiming();
+        while (queue.try_dequeue(temp)) {}
+        state.ResumeTiming();
+
+        benchmark::DoNotOptimize(enqueued);
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * total_items));
+    state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * total_items * sizeof(int)));
+}
+
+// Benchmark for blocking bulk enqueue with varying queue fullness
+template<size_t QueueSize>
+static void BM_BlockingBulkEnqueue(benchmark::State& state) {
+    SPSCQueue<int, QueueSize> queue;
+
+    // Pre-fill the queue to a certain percentage of capacity
+    const double fill_percentage = state.range(0) / 100.0;
+    const size_t fill_count = static_cast<size_t>(queue.capacity() * fill_percentage);
+
+    // Calculate how many more items we can safely enqueue
+    // Add 1 to ensure we have at least one item to enqueue
+    const size_t max_safe_to_enqueue = std::max<size_t>(1, queue.capacity() - fill_count);
+    // Limit batch size to avoid deadlocks
+    const size_t batch_size = std::min<size_t>(50, max_safe_to_enqueue);
+
+    std::vector<int> items(batch_size);
+
+    // Fill the items vector with test data
+    for (size_t i = 0; i < items.size(); ++i) {
+        items[i] = static_cast<int>(i);
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Clear the queue
+        int temp;
+        while (queue.try_dequeue(temp)) {}
+
+        // Pre-fill the queue to the specified percentage
+        for (size_t i = 0; i < fill_count; ++i) {
+            queue.try_enqueue(static_cast<int>(i));
+        }
+        state.ResumeTiming();
+
+        // Perform blocking bulk enqueue operation with a timeout to prevent deadlocks
+        size_t enqueued = queue.enqueue_bulk_for(items.begin(), items.size(), std::chrono::milliseconds(100));
+
+        // Empty the queue for the next iteration
+        state.PauseTiming();
+        while (queue.try_dequeue(temp)) {}
+        state.ResumeTiming();
+
+        benchmark::DoNotOptimize(enqueued);
+    }
+
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * batch_size));
+    state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * batch_size * sizeof(int)));
+}
+
+// Register the benchmarks
+BENCHMARK(BM_TrivialEnqueueDequeue<16>)->Range(1, 1 << 10);
+BENCHMARK(BM_TrivialEnqueueDequeue<128>)->Range(1, 1 << 10);
+BENCHMARK(BM_TrivialEnqueueDequeue<1024>)->Range(1, 1 << 10);
+
+BENCHMARK(BM_NonTrivialEnqueueDequeue<16>)->Range(1, 1 << 10);
+BENCHMARK(BM_NonTrivialEnqueueDequeue<128>)->Range(1, 1 << 10);
+BENCHMARK(BM_NonTrivialEnqueueDequeue<1024>)->Range(1, 1 << 10);
+
+BENCHMARK(BM_ProducerConsumer<128, int>)->Range(1000, 100000);
+BENCHMARK(BM_ProducerConsumer<128, std::string>)->Range(1000, 100000);
+
+BENCHMARK(BM_BulkDequeue<1024>)
+    ->Args({10000, 1})     // Total items, bulk size of 1
+    ->Args({10000, 10})    // Total items, bulk size of 10
+    ->Args({10000, 50})    // Total items, bulk size of 50
+    ->Args({10000, 100});  // Total items, bulk size of 100
+
+BENCHMARK(BM_IndividualVsBulkDequeue)
+    ->Arg(0)   // Use individual dequeue
+    ->Arg(1);  // Use bulk dequeue
+
+BENCHMARK(BM_EnqueueDequeueLatency<16>)->UseRealTime();
+BENCHMARK(BM_EnqueueDequeueLatency<128>)->UseRealTime();
+BENCHMARK(BM_EnqueueDequeueLatency<1024>)->UseRealTime();
+
+BENCHMARK(BM_EmplaceVsEnqueue<128>)
+    ->Args({0, 1000})   // Regular enqueue, 1000 items
+    ->Args({1, 1000});  // Emplace, 1000 items
+
+BENCHMARK(BM_BulkEnqueue<1024>)
+    ->Args({100, 1})     // 100 operations, bulk size of 1
+    ->Args({100, 10})    // 100 operations, bulk size of 10
+    ->Args({100, 50})    // 100 operations, bulk size of 50
+    ->Args({100, 100});  // 100 operations, bulk size of 100
+
+BENCHMARK(BM_IndividualVsBulkEnqueue)
+    ->Arg(0)   // Use individual enqueue
+    ->Arg(1);  // Use bulk enqueue
+
+BENCHMARK(BM_BlockingBulkEnqueue<128>)
+    ->Arg(0)    // Queue 0% full
+    ->Arg(25)   // Queue 25% full
+    ->Arg(50)   // Queue 50% full
+    ->Arg(75)   // Queue 75% full
+    ->Arg(95);  // Queue 95% full
+
+BENCHMARK(BM_BlockingBulkEnqueue<1024>)
+    ->Arg(0)    // Queue 0% full
+    ->Arg(25)   // Queue 25% full
+    ->Arg(50)   // Queue 50% full
+    ->Arg(75)   // Queue 75% full
+    ->Arg(95);  // Queue 95% full
+
+BENCHMARK_MAIN();
\ No newline at end of file
diff --git a/benchmarks/SynchronousInferenceBenchmark.cpp b/benchmarks/SynchronousInferenceBenchmark.cpp
index b6cc107..62ca772 100644
--- a/benchmarks/SynchronousInferenceBenchmark.cpp
+++ b/benchmarks/SynchronousInferenceBenchmark.cpp
@@ -32,9 +32,7 @@ namespace Finn {
 
 template<bool SynchronousInference>
 Finn::Driver<SynchronousInference> createDriverFromConfig(const std::filesystem::path& configFilePath, unsigned int batchSize) {
-    Finn::Driver<SynchronousInference> driver(configFilePath, batchSize);
-    driver.setForceAchieval(true);
-    return driver;
+    return Finn::Driver<SynchronousInference>(configFilePath, batchSize);
 }
 
 static void BM_SynchronousInference(benchmark::State& state) {
diff --git a/benchmarks/expectedPerformance.json b/benchmarks/expectedPerformance.json
index 0ff326f..28cd9ed 100644
--- a/benchmarks/expectedPerformance.json
+++ b/benchmarks/expectedPerformance.json
@@ -32,7 +32,11 @@
         "num_sharing": 8
       }
     ],
-    "load_avg": [4.49,4.48,4.3],
+    "load_avg": [
+      4.49,
+      4.48,
+      4.3
+    ],
     "library_version": "v1.9.1",
     "library_build_type": "release",
     "json_schema_version": 1
diff --git a/buildDependencies.sh b/buildDependencies.sh
deleted file mode 100755
index d77055c..0000000
--- a/buildDependencies.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-#Setup
-git submodule update --init --recursive
-
-
diff --git a/cmake/AddBenchmark.cmake b/cmake/AddBenchmark.cmake
index d287ffc..b93ab65 100644
--- a/cmake/AddBenchmark.cmake
+++ b/cmake/AddBenchmark.cmake
@@ -14,14 +14,10 @@ function(add_benchmark benchmark_name)
   target_link_libraries(${benchmark}
     PUBLIC
     finnc_options
-    ${Boost_LIBRARIES}
-    finnc_utils
     finnc_core_test
     xrt_mock
     benchmark::benchmark
     OpenMP::OpenMP_CXX
   )
 
-  target_link_directories(${benchmark} PRIVATE ${BOOST_LIBRARYDIR})
-
 endfunction()
diff --git a/cmake/AddIntegrationtest.cmake b/cmake/AddIntegrationtest.cmake
index 2551354..5061aff 100644
--- a/cmake/AddIntegrationtest.cmake
+++ b/cmake/AddIntegrationtest.cmake
@@ -13,8 +13,8 @@ function(add_integrationtest integrationtest_name)
   #target_compile_definitions(${integrationtest} PRIVATE UNITTEST=1)
 
   target_include_directories(${integrationtest} SYSTEM PRIVATE ${XRT_INCLUDE_DIRS} ${FINN_SRC_DIR})
-  target_link_directories(${integrationtest} PRIVATE ${XRT_LIB_CORE_LOCATION} ${XRT_LIB_OCL_LOCATION} ${BOOST_LIBRARYDIR})
-  target_link_libraries(${integrationtest} PRIVATE gtest finnc_core finnc_options Threads::Threads OpenCL xrt_coreutil uuid finnc_utils ${Boost_LIBRARIES} nlohmann_json::nlohmann_json OpenMP::OpenMP_CXX)
+  target_link_directories(${integrationtest} PRIVATE ${XRT_LIB_CORE_LOCATION} ${XRT_LIB_OCL_LOCATION})
+  target_link_libraries(${integrationtest} PRIVATE gtest finnc_core finnc_options Threads::Threads OpenCL xrt_coreutil uuid nlohmann_json::nlohmann_json OpenMP::OpenMP_CXX)
 
 
 endfunction()
diff --git a/cmake/AddUnittest.cmake b/cmake/AddUnittest.cmake
index 62020a5..2192129 100644
--- a/cmake/AddUnittest.cmake
+++ b/cmake/AddUnittest.cmake
@@ -14,14 +14,12 @@ function(add_unittest test_name)
     PUBLIC
     gtest
     finnc_options
-    ${Boost_LIBRARIES}
-    finnc_utils
     finnc_core_test
     xrt_mock
     OpenMP::OpenMP_CXX
   )
 
-  target_link_directories(${test} PRIVATE ${BOOST_LIBRARYDIR})
+  target_link_directories(${test} PRIVATE)
 
   target_include_directories(${test} PRIVATE ${FINN_SRC_DIR})
 
diff --git a/external/plog b/external/plog
new file mode 160000
index 0000000..cffccc4
--- /dev/null
+++ b/external/plog
@@ -0,0 +1 @@
+Subproject commit cffccc4bcfcfa8ef19696c13b07561c7b75aa9bf
diff --git a/external/popl b/external/popl
new file mode 160000
index 0000000..bda5f43
--- /dev/null
+++ b/external/popl
@@ -0,0 +1 @@
+Subproject commit bda5f43099d67419089a44c9e54474e4998a9a26
diff --git a/external/xsimd b/external/xsimd
index a48ab43..df83e16 160000
--- a/external/xsimd
+++ b/external/xsimd
@@ -1 +1 @@
-Subproject commit a48ab430d4b84ecd5449180ee1c6d2eed67c4191
+Subproject commit df83e16489673cd6ef2237fa98ea288384dab74e
diff --git a/external/xtensor b/external/xtensor
index cd06b35..5598564 160000
--- a/external/xtensor
+++ b/external/xtensor
@@ -1 +1 @@
-Subproject commit cd06b35beeae43bea09a03f50bcda16b4fa4f6c0
+Subproject commit 5598564ae6785bd9fffe22664f2e650ce3a991c7
diff --git a/external/xtl b/external/xtl
index b072176..174e99d 160000
--- a/external/xtl
+++ b/external/xtl
@@ -1 +1 @@
-Subproject commit b072176e173216880f4342c1bb390cb3ee0351c5
+Subproject commit 174e99da6e27c3ee5837fb74e7e65330af4cd2a6
diff --git a/integrationtest/SyncInference.cpp b/integrationtest/SyncInference.cpp
index 17353b6..9a34c10 100644
--- a/integrationtest/SyncInference.cpp
+++ b/integrationtest/SyncInference.cpp
@@ -26,9 +26,9 @@ TEST(SyncInference, syncInferenceTest) {
     std::string exampleNetworkConfig = "jetConfig.json";
     Finn::Config conf = Finn::createConfigFromPath(exampleNetworkConfig);
 
-    auto driver = Finn::Driver<true>(conf, 0, conf.deviceWrappers[0].idmas[0]->kernelName, 0, conf.deviceWrappers[0].odmas[0]->kernelName, 1, true);
+    auto driver = Finn::Driver<true>(conf, 0, conf.deviceWrappers[0].idmas[0]->kernelName, 0, conf.deviceWrappers[0].odmas[0]->kernelName, 1);
 
-    Finn::vector<int8_t> data(driver.size(SIZE_SPECIFIER::FEATUREMAP_SIZE, 0, conf.deviceWrappers[0].idmas[0]->kernelName), 1);
+    Finn::vector<int8_t> data(driver.getFeatureMapSize(0, conf.deviceWrappers[0].idmas[0]->kernelName), 1);
 
     std::iota(data.begin(), data.end(), -127);
 
@@ -45,13 +45,13 @@ TEST(SyncInference, syncBatchInferenceTest) {
     Finn::Config conf = Finn::createConfigFromPath(exampleNetworkConfig);
     std::size_t batchLength = 10;
 
-    auto driver = Finn::Driver<true>(conf, 0, conf.deviceWrappers[0].idmas[0]->kernelName, 0, conf.deviceWrappers[0].odmas[0]->kernelName, static_cast<uint>(batchLength), true);
+    auto driver = Finn::Driver<true>(conf, 0, conf.deviceWrappers[0].idmas[0]->kernelName, 0, conf.deviceWrappers[0].odmas[0]->kernelName, static_cast<uint>(batchLength));
 
-    Finn::vector<int8_t> data(driver.size(SIZE_SPECIFIER::FEATUREMAP_SIZE, 0, conf.deviceWrappers[0].idmas[0]->kernelName) * batchLength, 1);
+    Finn::vector<int8_t> data(driver.getFeatureMapSize(0, conf.deviceWrappers[0].idmas[0]->kernelName) * batchLength, 1);
 
     for (std::size_t i = 0; i < batchLength; ++i) {
-        std::iota(data.begin() + static_cast<decltype(data)::difference_type>(i * driver.size(SIZE_SPECIFIER::FEATUREMAP_SIZE, 0, conf.deviceWrappers[0].idmas[0]->kernelName)),
-                  data.begin() + static_cast<decltype(data)::difference_type>((i + 1) * driver.size(SIZE_SPECIFIER::FEATUREMAP_SIZE, 0, conf.deviceWrappers[0].idmas[0]->kernelName)), -127);
+        std::iota(data.begin() + static_cast<decltype(data)::difference_type>(i * driver.getFeatureMapSize(0, conf.deviceWrappers[0].idmas[0]->kernelName)),
+                  data.begin() + static_cast<decltype(data)::difference_type>((i + 1) * driver.getFeatureMapSize(0, conf.deviceWrappers[0].idmas[0]->kernelName)), -127);
     }
 
     // Run inference
@@ -68,6 +68,7 @@ TEST(SyncInference, syncBatchInferenceTest) {
 
 int main(int argc, char** argv) {
     ::testing::InitGoogleTest(&argc, argv);
+    Logger::initLogger(true);
 
     return RUN_ALL_TESTS();
 }
\ No newline at end of file
diff --git a/iwyu.imp b/iwyu.imp
index e3476e7..76a07e3 100644
--- a/iwyu.imp
+++ b/iwyu.imp
@@ -1,5 +1,5 @@
 [
-{ include: ["@<boost/log/.*>", "private", "<FINNCppDriver/utils/Logger.h>", "public"] },
+{ include: ["@<plog/.*>", "private", "<FINNCppDriver/utils/Logger.hpp>", "public"] },
 { include: [ "<boost/log/core/record.hpp>", "private", "<boost/log/trivial.hpp>", "private" ] },
 { include: [ "<boost/log/sources/record_ostream.hpp>", "private", "<boost/log/trivial.hpp>", "private" ] },
 { include: [ "<boost/log/sinks/basic_sink_frontend.hpp>", "private", "<boost/log/trivial.hpp>", "private" ]},
diff --git a/scripts/init_submodules.sh b/scripts/init_submodules.sh
old mode 100755
new mode 100644
diff --git a/scripts/install_precommit.sh b/scripts/install_precommit.sh
old mode 100755
new mode 100644
diff --git a/scripts/pre-commit.template b/scripts/pre-commit.template
old mode 100755
new mode 100644
diff --git a/src/FINNCppDriver/CMakeLists.txt b/src/FINNCppDriver/CMakeLists.txt
index a4c0be8..3db00bf 100644
--- a/src/FINNCppDriver/CMakeLists.txt
+++ b/src/FINNCppDriver/CMakeLists.txt
@@ -1,12 +1,12 @@
-add_subdirectory(utils)
+#add_subdirectory(utils)
 add_subdirectory(core)
 add_subdirectory(config) 
 
 add_executable(finnhpc FINNDriver.cpp)
 target_include_directories(finnhpc SYSTEM PRIVATE ${XRT_INCLUDE_DIRS})
 target_include_directories(finnhpc PRIVATE ${FINN_SRC_DIR})
-target_link_directories(finnhpc PRIVATE ${XRT_LIB_CORE_LOCATION} ${XRT_LIB_OCL_LOCATION} ${BOOST_LIBRARYDIR})
-target_link_libraries(finnhpc PRIVATE finnc_core finnc_options Threads::Threads OpenCL xrt_coreutil uuid finnc_utils finn_config ${Boost_LIBRARIES} nlohmann_json::nlohmann_json OpenMP::OpenMP_CXX)
+target_link_directories(finnhpc PRIVATE ${XRT_LIB_CORE_LOCATION} ${XRT_LIB_OCL_LOCATION})
+target_link_libraries(finnhpc PRIVATE finnc_core finnc_options Threads::Threads OpenCL xrt_coreutil uuid finn_config nlohmann_json::nlohmann_json OpenMP::OpenMP_CXX)
 set_target_properties(finnhpc
     PROPERTIES
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
diff --git a/src/FINNCppDriver/FINNDriver.cpp b/src/FINNCppDriver/FINNDriver.cpp
index 0aeb29d..844dc9c 100644
--- a/src/FINNCppDriver/FINNDriver.cpp
+++ b/src/FINNCppDriver/FINNDriver.cpp
@@ -31,19 +31,19 @@
 #include <FINNCppDriver/utils/ConfigurationStructs.h>  // for Config
 #include <FINNCppDriver/utils/DoNotOptimize.h>         // for DoNotOptimize
 #include <FINNCppDriver/utils/FinnUtils.h>             // for logAndError
-#include <FINNCppDriver/utils/Logger.h>                // for FINN_LOG, ...
 #include <FINNCppDriver/utils/Types.h>                 // for shape_t
 
 #include <FINNCppDriver/core/BaseDriver.hpp>      // IWYU pragma: keep
 #include <FINNCppDriver/utils/DataPacking.hpp>    // for AutoReturnType
 #include <FINNCppDriver/utils/DynamicMdSpan.hpp>  // for DynamicMdSpan
-#include <boost/program_options.hpp>              // for variables_map
+#include <FINNCppDriver/utils/Logger.hpp>         // for FINN_LOG, ...
 #include <ext/alloc_traits.h>                     // for __alloc_tr...
-#include <xtensor/xadapt.hpp>                     // for adapt
-#include <xtensor/xarray.hpp>                     // for xarray_ada...
-#include <xtensor/xiterator.hpp>                  // for operator==
-#include <xtensor/xlayout.hpp>                    // for layout_type
-#include <xtensor/xnpy.hpp>                       // for dump_npy, ...
+#include <popl.hpp>                               // for program options
+#include <xtensor/containers/xadapt.hpp>          // for adapt
+#include <xtensor/containers/xarray.hpp>          // for xarray_ada...
+#include <xtensor/core/xiterator.hpp>             // for operator==
+#include <xtensor/core/xlayout.hpp>               // for layout_type
+#include <xtensor/io/xnpy.hpp>                    // for dump_npy, ...
 #include <xtl/xiterator_base.hpp>                 // for operator!=
 
 
@@ -96,20 +96,20 @@ std::string finnMainLogPrefix() { return "[FINNDriver] "; }
  * @param device
  * @param filename
  */
-void logDeviceInformation(logger_type& logger, xrt::device& device, const std::string& filename) {
+void logDeviceInformation(xrt::device& device, const std::string& filename) {
     auto bdfInfo = device.get_info<xrt::info::device::bdf>();
-    FINN_LOG(logger, loglevel::info) << "BDF: " << bdfInfo;
+    FINN_LOG(loglevel::info) << "BDF: " << bdfInfo;
     auto xclbin = xrt::xclbin(filename);
     auto kernels = xclbin.get_kernels();
 
     for (auto&& knl : kernels) {
-        FINN_LOG(logger, loglevel::info) << "Kernel: " << knl.get_name() << "\n";
+        FINN_LOG(loglevel::info) << "Kernel: " << knl.get_name() << "\n";
         for (auto&& arg : knl.get_args()) {
-            FINN_LOG(logger, loglevel::info) << "\t\t\tArg: " << arg.get_name() << " Size: " << arg.get_size() << "\n";
+            FINN_LOG(loglevel::info) << "\t\t\tArg: " << arg.get_name() << " Size: " << arg.get_size() << "\n";
         }
 
         for (auto&& compUnit : knl.get_cus()) {
-            FINN_LOG(logger, loglevel::info) << " \t\t\tCU: " << compUnit.get_name() << " Size: " << compUnit.get_size() << "\n";
+            FINN_LOG(loglevel::info) << " \t\t\tCU: " << compUnit.get_name() << " Size: " << compUnit.get_size() << "\n";
         }
     }
 }
@@ -124,9 +124,7 @@ void logDeviceInformation(logger_type& logger, xrt::device& device, const std::s
  */
 template<bool SynchronousInference>
 Finn::Driver<SynchronousInference> createDriverFromConfig(const std::filesystem::path& configFilePath, unsigned int batchSize) {
-    Finn::Driver<SynchronousInference> driver(configFilePath, batchSize);
-    driver.setForceAchieval(true);
-    return driver;
+    return Finn::Driver<SynchronousInference>(configFilePath, batchSize);
 }
 
 template<typename O>
@@ -144,7 +142,7 @@ void runThroughputTestImpl(Finn::Driver<true>& baseDriver, std::size_t elementCo
 
     auto gen = [&dist, &mersenneEngine]() { return dist(mersenneEngine); };
 
-    constexpr size_t nTestruns = 5000;
+    constexpr size_t nTestruns = 10;
     std::chrono::duration<double> sumRuntimeEnd2End{};
 
     // Warmup
@@ -214,14 +212,14 @@ void runThroughputTestImpl(Finn::Driver<true>& baseDriver, std::size_t elementCo
  * @param baseDriver
  * @param logger
  */
-void runThroughputTest(Finn::Driver<true>& baseDriver, logger_type& logger) {
-    FINN_LOG(logger, loglevel::info) << finnMainLogPrefix() << "Device Information: ";
-    logDeviceInformation(logger, baseDriver.getDeviceHandler(0).getDevice(), baseDriver.getConfig().deviceWrappers[0].xclbin);
+void runThroughputTest(Finn::Driver<true>& baseDriver) {
+    FINN_LOG(loglevel::info) << finnMainLogPrefix() << "Device Information: ";
+    logDeviceInformation(baseDriver.getDeviceHandler(0).getDevice(), baseDriver.getConfig().deviceWrappers[0].xclbin);
 
     size_t elementcount = FinnUtils::shapeToElements((std::static_pointer_cast<Finn::ExtendedBufferDescriptor>(baseDriver.getConfig().deviceWrappers[0].idmas[0]))->normalShape);
     uint batchSize = baseDriver.getBatchSize();
-    FINN_LOG(logger, loglevel::info) << finnMainLogPrefix() << "Input element count " << std::to_string(elementcount);
-    FINN_LOG(logger, loglevel::info) << finnMainLogPrefix() << "Batch size: " << batchSize;
+    FINN_LOG(loglevel::info) << finnMainLogPrefix() << "Input element count " << std::to_string(elementcount);
+    FINN_LOG(loglevel::info) << finnMainLogPrefix() << "Batch size: " << batchSize;
 
     constexpr bool isInteger = InputFinnType().isInteger();
     if constexpr (isInteger) {
@@ -333,9 +331,9 @@ void inferUnsignedInteger(Finn::Driver<true>& baseDriver, xt::detail::npy_file&
  * @param inputFiles Files used for inference input
  * @param outputFiles Filenames used for output files
  */
-void runWithInputFile(Finn::Driver<true>& baseDriver, logger_type& logger, const std::vector<std::string>& inputFiles, const std::vector<std::string>& outputFiles) {
-    FINN_LOG(logger, loglevel::info) << finnMainLogPrefix() << "Running driver on input files";
-    logDeviceInformation(logger, baseDriver.getDeviceHandler(0).getDevice(), baseDriver.getConfig().deviceWrappers[0].xclbin);
+void runWithInputFile(Finn::Driver<true>& baseDriver, const std::vector<std::string>& inputFiles, const std::vector<std::string>& outputFiles) {
+    FINN_LOG(loglevel::info) << finnMainLogPrefix() << "Running driver on input files";
+    logDeviceInformation(baseDriver.getDeviceHandler(0).getDevice(), baseDriver.getConfig().deviceWrappers[0].xclbin);
 
     for (auto&& [inp, out] = std::tuple{inputFiles.begin(), outputFiles.begin()}; inp != inputFiles.end(); ++inp, ++out) {
         // load npy file and process it
@@ -384,62 +382,6 @@ void runWithInputFile(Finn::Driver<true>& baseDriver, logger_type& logger, const
     }
 }
 
-/**
- * @brief Validates the user input for the driver mode switch
- *
- * @param mode User input string for selected mode
- */
-void validateDriverMode(const std::string& mode) {
-    if (mode != "execute" && mode != "throughput") {
-        throw boost::program_options::error_with_option_name("'" + mode + "' is not a valid driver mode!", "exec_mode");
-    }
-
-    FINN_LOG(Logger::getLogger(), loglevel::info) << finnMainLogPrefix() << "Driver Mode: " << mode;
-}
-
-/**
- * @brief Validates the user input for the batch size
- *
- * @param batch User input batch size
- */
-void validateBatchSize(int batch) {
-    if (batch <= 0) {
-        throw boost::program_options::error_with_option_name("Batch size must be positive, but is '" + std::to_string(batch) + "'", "batchsize");
-    }
-}
-
-/**
- * @brief Validates the user input for the config path. Also checks if file exists
- *
- * @param path Path string to be validated
- */
-void validateConfigPath(const std::string& path) {
-    auto configFilePath = std::filesystem::path(path);
-    if (!std::filesystem::exists(configFilePath)) {
-        throw boost::program_options::error_with_option_name("Cannot find config file at " + configFilePath.string(), "configpath");
-    }
-
-    FINN_LOG(Logger::getLogger(), loglevel::info) << finnMainLogPrefix() << "Config file found at " << configFilePath.string();
-}
-
-/**
- * @brief Validates the user input for the input file path. Also checks if input file exists.
- *
- * @param path Path string to be validated
- */
-void validateInputPath(const std::vector<std::string>& path) {
-    for (auto&& elem : path) {
-        auto inputFilePath = std::filesystem::path(elem);
-        if (!std::filesystem::exists(inputFilePath)) {
-            throw boost::program_options::error_with_option_name("Cannot find input file at " + inputFilePath.string(), "input");
-        }
-        FINN_LOG(Logger::getLogger(), loglevel::info) << finnMainLogPrefix() << "Input file found at " << inputFilePath.string();
-    }
-}
-
-
-namespace po = boost::program_options;
-
 /**
  * @brief Main entrypoint for the frontend of the C++ Finn driver
  *
@@ -448,58 +390,105 @@ namespace po = boost::program_options;
  * @return int Exit status code
  */
 int main(int argc, char* argv[]) {
-    auto logger = Logger::getLogger();
-    FINN_LOG(logger, loglevel::info) << "C++ Driver started";
+    Logger::initLogger();
+    FINN_LOG(loglevel::info) << "C++ Driver started";
 
     try {
         // Command Line Argument Parser
-        po::options_description desc{"Options"};
-        // clang-format off
-        desc.add_options()("help,h", "Display help")("exec_mode,e", po::value<std::string>()->default_value("throughput")->notifier(&validateDriverMode),
-            R"(Please select functional verification ("execute") or throughput test ("throughput")")("configpath,c", po::value<std::string>()->required()->notifier(&validateConfigPath),
-                "Required: Path to the config.json file emitted by the FINN compiler")(
-                    "input,i", po::value<std::vector<std::string>>()->multitoken()->composing()->notifier(&validateInputPath), "Path to one or more input files (npy format). Only required if mode is set to \"file\"")(
-                        "output,o", po::value<std::vector<std::string>>()->multitoken()->composing(), "Path to one or more output files (npy format). Only required if mode is set to \"file\"")(
-                            "batchsize,b", po::value<int>()->default_value(1)->notifier(&validateBatchSize), "Number of samples for inference")("check", "Outputs the compile time configuration");
-        // clang-format on
-        po::variables_map varMap;
-        po::store(po::parse_command_line(argc, argv, desc), varMap);
+        popl::OptionParser options("Options");
+
+        auto help_option = options.add<popl::Switch>("h", "help", "Display help");
+        auto mode_option = options.add<popl::Value<std::string>>("e", "exec_mode", R"(Please select functional verification ("execute") or throughput test ("throughput")", "throughput");
+        auto config_option = options.add<popl::Value<std::string>>("c", "configpath", "Required: Path to the config.json file emitted by the FINN compiler");
+        auto input_option = options.add<popl::Value<std::string>>("i", "input", "Path to one or more input files (npy format). Only required if mode is set to \"file\"");
+        auto output_option = options.add<popl::Value<std::string>>("o", "output", "Path to one or more output files (npy format). Only required if mode is set to \"file\"");
+        auto batch_option = options.add<popl::Value<unsigned>>("b", "batchsize", "Number of samples for inference", 1);
+        auto check_option = options.add<popl::Switch>("", "check", "Outputs the compile time configuration");
+
+        options.parse(argc, argv);
 
         // Display help screen
-        // Help option has to be processed before po::notify call to not enforce required options in combination with help
-        if (varMap.count("help") != 0) {
-            std::cout << desc << "\n";
+        if (help_option->is_set()) {
+            std::cout << options << "\n";
             return 0;
         }
 
-        if (varMap.count("check") != 0) {
+        if (check_option->is_set()) {
             std::cout << "input_t: " << Finn::type_name<InputFinnType>() << "\n";
             std::cout << "output_t: " << Finn::type_name<OutputFinnType>() << "\n";
             return 0;
         }
 
-        po::notify(varMap);
 
-        FINN_LOG(logger, loglevel::info) << finnMainLogPrefix() << "Parsed command line params";
+        if (mode_option->count() > 1) {
+            throw std::runtime_error("Command Line Argument Error: exec_mode can only be set once!");
+        }
+
+        std::string mode = mode_option->value();
+
+        if (mode != "execute" && mode != "throughput") {
+            throw std::runtime_error("Command Line Argument Error:'" + mode + "' is not a valid driver mode!");
+        }
+
+        FINN_LOG(loglevel::info) << finnMainLogPrefix() << "Driver Mode: " << mode;
+
+
+        if (config_option->is_set()) {
+            if (config_option->count() != 1) {
+                throw std::runtime_error("Command Line Argument Error: configpath can only be set once!");
+            }
+
+            auto configFilePath = std::filesystem::path(config_option->value());
+            if (!std::filesystem::exists(configFilePath)) {
+                throw std::runtime_error("Command Line Argument Error: Cannot find config file at " + configFilePath.string());
+            }
+
+            FINN_LOG(loglevel::info) << finnMainLogPrefix() << "Config file found at " << configFilePath.string();
+        } else {
+            throw std::runtime_error("Command Line Argument Error: configpath is required to be set!");
+        }
+
+        if (input_option->is_set()) {
+            for (size_t i = 0; i < input_option->count(); ++i) {
+                std::string elem = input_option->value(i);
+                auto inputFilePath = std::filesystem::path(elem);
+                if (!std::filesystem::exists(inputFilePath)) {
+                    throw std::runtime_error("Command Line Argument Error: Cannot find input file at " + inputFilePath.string());
+                }
+                FINN_LOG_DEBUG(loglevel::info) << finnMainLogPrefix() << "Input file found at " << inputFilePath.string();
+            }
+        }
+
+        FINN_LOG(loglevel::info) << finnMainLogPrefix() << "Parsed command line params";
 
         // Switch on modes
-        if (varMap["exec_mode"].as<std::string>() == "execute") {
-            if (varMap.count("input") == 0) {
+        if (mode_option->value() == "execute") {
+            if (!input_option->is_set()) {
                 FinnUtils::logAndError<std::invalid_argument>("No input file(s) specified for file execution mode!");
             }
-            if (varMap.count("output") == 0) {
+            if (!output_option->is_set()) {
                 FinnUtils::logAndError<std::invalid_argument>("No output file(s) specified for file execution mode!");
             }
-            if (varMap.count("input") != varMap.count("output")) {
+            if (input_option->count() != output_option->count()) {
                 FinnUtils::logAndError<std::invalid_argument>("Same amount of input and output files required!");
             }
-            auto driver = createDriverFromConfig<true>(varMap["configpath"].as<std::string>(), static_cast<uint>(varMap["batchsize"].as<int>()));
-            runWithInputFile(driver, logger, varMap["input"].as<std::vector<std::string>>(), varMap["output"].as<std::vector<std::string>>());
-        } else if (varMap["exec_mode"].as<std::string>() == "throughput") {
-            auto driver = createDriverFromConfig<true>(varMap["configpath"].as<std::string>(), static_cast<uint>(varMap["batchsize"].as<int>()));
-            runThroughputTest(driver, logger);
+
+            std::vector<std::string> inputVec;
+            for (size_t i = 0; i < input_option->count(); ++i) {
+                inputVec.emplace_back(input_option->value(i));
+            }
+            std::vector<std::string> outputVec;
+            for (size_t i = 0; i < output_option->count(); ++i) {
+                outputVec.emplace_back(output_option->value(i));
+            }
+
+            auto driver = createDriverFromConfig<true>(config_option->value(), batch_option->value());
+            runWithInputFile(driver, inputVec, outputVec);
+        } else if (mode_option->value() == "throughput") {
+            auto driver = createDriverFromConfig<true>(config_option->value(), batch_option->value());
+            runThroughputTest(driver);
         } else {
-            FinnUtils::logAndError<std::invalid_argument>("Unknown driver mode: " + varMap["exec_mode"].as<std::string>());
+            FinnUtils::logAndError<std::invalid_argument>("Unknown driver mode: " + mode_option->value());
         }
 
         return 1;
diff --git a/src/FINNCppDriver/config/FinnDatatypesJet.h b/src/FINNCppDriver/config/FinnDatatypesJet.h
deleted file mode 100644
index bdff338..0000000
--- a/src/FINNCppDriver/config/FinnDatatypesJet.h
+++ /dev/null
@@ -1,17 +0,0 @@
-// THIS FILE IS AUTOGENERATED BY THE FINN COMPILER
-
-#ifndef FINNDRIVERUSEDDATATYPES
-#define FINNDRIVERUSEDDATATYPES
-#include "../core/BaseDriver.hpp"
-#include "../utils/FinnDatatypes.hpp"
-
-using InputFinnType = Finn::DatatypeInt<8>;
-using OutputFinnType = Finn::DatatypeInt<16>;
-
-namespace Finn {
-    template<bool SynchronousInference>
-    using Driver = Finn::BaseDriver<SynchronousInference, InputFinnType, OutputFinnType>;
-}  // namespace Finn
-
-
-#endif  // FINNDRIVERUSEDDATATYPES
diff --git a/src/FINNCppDriver/config/jetConfig.json b/src/FINNCppDriver/config/jetConfig.json
deleted file mode 100644
index 5c27396..0000000
--- a/src/FINNCppDriver/config/jetConfig.json
+++ /dev/null
@@ -1,45 +0,0 @@
-[
-  {
-    "xrtDeviceIndex": 0,
-    "xclbinPath": "/scratch/hpc-prf-ekiapp/linusjun/finn_dev/FINN_WORKDIR/finn-on-n2/model-final/out_dir_nomt_80/bitfile/finn-accel.xclbin",
-    "name": "MainDevice",
-    "idmas": [
-      {
-        "kernelName": "StreamingDataflowPartition_0:{idma0}",
-        "normalShape": [
-          1,
-          24
-        ],
-        "foldedShape": [
-          1,
-          1,
-          24
-        ],
-        "packedShape": [
-          1,
-          1,
-          24
-        ]
-      }
-    ],
-    "odmas": [
-      {
-        "kernelName": "StreamingDataflowPartition_2:{odma0}",
-        "normalShape": [
-          1,
-          5
-        ],
-        "foldedShape": [
-          1,
-          1,
-          5
-        ],
-        "packedShape": [
-          1,
-          1,
-          10
-        ]
-      }
-    ]
-  }
-]
\ No newline at end of file
diff --git a/src/FINNCppDriver/core/Accelerator.cpp b/src/FINNCppDriver/core/Accelerator.cpp
index e00ef40..b2e1bfc 100644
--- a/src/FINNCppDriver/core/Accelerator.cpp
+++ b/src/FINNCppDriver/core/Accelerator.cpp
@@ -15,18 +15,18 @@
 #include <FINNCppDriver/core/DeviceHandler.h>          // for DeviceHandler, UncheckedStore, ...
 #include <FINNCppDriver/utils/ConfigurationStructs.h>  // IWYU pragma: keep
 #include <FINNCppDriver/utils/FinnUtils.h>             // for logAndError, unreachable
-#include <FINNCppDriver/utils/Logger.h>                // for operator<<, DevNull
 
-#include <algorithm>  // for count_if, find_if, tra...
-#include <cstddef>    // for size_t
-#include <iterator>   // for back_insert_iterator
-#include <stdexcept>  // for runtime_error
+#include <FINNCppDriver/utils/Logger.hpp>  // for operator<<, DevNull
+#include <algorithm>                       // for count_if, find_if, tra...
+#include <cstddef>                         // for size_t
+#include <iterator>                        // for back_insert_iterator
+#include <stdexcept>                       // for runtime_error
 
 namespace Finn {
     std::string Accelerator::loggerPrefix() { return "[Accelerator] "; }
 
     Accelerator::Accelerator(const std::vector<DeviceWrapper>& deviceDefinitions, bool synchronousInference, unsigned int hostBufferSize) {
-        FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "Constructing Accelerator\n";
+        FINN_LOG(loglevel::info) << loggerPrefix() << "Constructing Accelerator\n";
         std::transform(deviceDefinitions.begin(), deviceDefinitions.end(), std::back_inserter(devices), [hostBufferSize, synchronousInference](const DeviceWrapper& dew) { return DeviceHandler(dew, synchronousInference, hostBufferSize); });
     }
 
@@ -102,14 +102,14 @@ namespace Finn {
         return ret;
     }
 
-    Finn::vector<uint8_t> Accelerator::getOutputData(const unsigned int deviceIndex, const std::string& outputBufferKernelName, bool forceArchival) {
+    Finn::vector<uint8_t> Accelerator::getOutputData(const unsigned int deviceIndex, const std::string& outputBufferKernelName) {
         if (containsDevice(deviceIndex)) {
-            FINN_LOG_DEBUG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "Retrieving results from the specified device index! [accelerator.retrieveResults()]";
-            return getDeviceHandler(deviceIndex).retrieveResults(outputBufferKernelName, forceArchival);
+            FINN_LOG_DEBUG(loglevel::info) << loggerPrefix() << "Retrieving results from the specified device index! [accelerator.retrieveResults()]";
+            return getDeviceHandler(deviceIndex).retrieveResults(outputBufferKernelName);
         } else {
             if (containsDevice(0)) {
-                FINN_LOG_DEBUG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "Retrieving results from 0  device index! [accelerator.retrueveResults()]";
-                return getDeviceHandler(0).retrieveResults(outputBufferKernelName, forceArchival);
+                FINN_LOG_DEBUG(loglevel::info) << loggerPrefix() << "Retrieving results from 0  device index! [accelerator.retrieveResults()]";
+                return getDeviceHandler(0).retrieveResults(outputBufferKernelName);
             } else {
                 // cppcheck-suppress missingReturn
                 FinnUtils::logAndError<std::runtime_error>("Tried receiving data in a devicehandler with an invalid deviceIndex!");
@@ -117,6 +117,31 @@ namespace Finn {
         }
     }
 
-    size_t Accelerator::size(SIZE_SPECIFIER ss, unsigned int deviceIndex, const std::string& bufferName) { return getDeviceHandler(deviceIndex).size(ss, bufferName); }
+    size_t Accelerator::getSizeInBytes(unsigned int deviceIndex, const std::string& bufferName) {
+        if (containsDevice(deviceIndex)) {
+            return getDeviceHandler(deviceIndex).getSizeInBytes(bufferName);
+        }
+        return 0;
+    }
+
+    size_t Accelerator::getFeatureMapSize(unsigned int deviceIndex, const std::string& bufferName) {
+        if (containsDevice(deviceIndex)) {
+            return getDeviceHandler(deviceIndex).getFeatureMapSize(bufferName);
+        }
+        return 0;
+    }
 
+    size_t Accelerator::getBatchSize(unsigned int deviceIndex, const std::string& bufferName) {
+        if (containsDevice(deviceIndex)) {
+            return getDeviceHandler(deviceIndex).getBatchSize(bufferName);
+        }
+        return 0;
+    }
+
+    size_t Accelerator::getTotalDataSize(unsigned int deviceIndex, const std::string& bufferName) {
+        if (containsDevice(deviceIndex)) {
+            return getDeviceHandler(deviceIndex).getTotalDataSize(bufferName);
+        }
+        return 0;
+    }
 }  // namespace Finn
diff --git a/src/FINNCppDriver/core/Accelerator.h b/src/FINNCppDriver/core/Accelerator.h
index 3055b42..b51bda6 100644
--- a/src/FINNCppDriver/core/Accelerator.h
+++ b/src/FINNCppDriver/core/Accelerator.h
@@ -165,17 +165,15 @@ namespace Finn {
          * @param forceArchival Whether or not to force a readout into archive. Necessary to get new data. Will be done automatically if a whole multiple of the buffer size is produced
          * @return std::vector<std::vector<uint8_t>>
          */
-        Finn::vector<uint8_t> getOutputData(unsigned int deviceIndex, const std::string& outputBufferKernelName, bool forceArchival);
+        Finn::vector<uint8_t> getOutputData(unsigned int deviceIndex, const std::string& outputBufferKernelName);
 
-        /**
-         * @brief Get the size of the buffer with the specified device index and buffer name
-         *
-         * @param ss
-         * @param deviceIndex
-         * @param bufferName
-         * @return std::size_t
-         */
-        std::size_t size(SIZE_SPECIFIER ss, unsigned int deviceIndex, const std::string& bufferName);
+        size_t getSizeInBytes(unsigned int deviceIndex, const std::string& bufferName);
+
+        size_t getFeatureMapSize(unsigned int deviceIndex, const std::string& bufferName);
+
+        size_t getBatchSize(unsigned int deviceIndex, const std::string& bufferName);
+
+        size_t getTotalDataSize(unsigned int deviceIndex, const std::string& bufferName);
     };
 
 
diff --git a/src/FINNCppDriver/core/BaseDriver.hpp b/src/FINNCppDriver/core/BaseDriver.hpp
index 9baacfd..d313b2f 100644
--- a/src/FINNCppDriver/core/BaseDriver.hpp
+++ b/src/FINNCppDriver/core/BaseDriver.hpp
@@ -15,12 +15,12 @@
 
 #include <FINNCppDriver/utils/ConfigurationStructs.h>
 #include <FINNCppDriver/utils/FinnUtils.h>
-#include <FINNCppDriver/utils/Logger.h>
 #include <FINNCppDriver/utils/Types.h>
 
 #include <FINNCppDriver/utils/DataPacking.hpp>
 #include <FINNCppDriver/utils/DynamicMdSpan.hpp>
 #include <FINNCppDriver/utils/FinnDatatypes.hpp>
+#include <FINNCppDriver/utils/Logger.hpp>  // for FINN_LOG, loglevel, ...
 #include <FINNCppDriver/utils/join.hpp>
 #include <bitset>
 #include <cinttypes>  // for uint8_t
@@ -48,14 +48,12 @@ namespace Finn {
     class BaseDriver {
         Accelerator accelerator;
         Config configuration;
-        logger_type& logger = Logger::getLogger();
 
         uint defaultInputDeviceIndex = 0;
         std::string defaultInputKernelName;
         uint defaultOutputDeviceIndex = 0;
         std::string defaultOutputKernelName;
         uint batchElements = 1;
-        bool forceAchieval = false;
 
         /**
          * @brief A logger prefix to determine the source of a log write
@@ -94,7 +92,7 @@ namespace Finn {
          * @param configPath
          * @param batchSize
          */
-        BaseDriver(const std::filesystem::path& configPath, uint batchSize) : configuration(createConfigFromPath(configPath)), logger(Logger::getLogger()) { initializeBaseDriver(batchSize); };
+        BaseDriver(const std::filesystem::path& configPath, uint batchSize) : configuration(createConfigFromPath(configPath)) { initializeBaseDriver(batchSize); };
 
         /**
          * @brief Create a new base driver based on an existing configuration
@@ -102,7 +100,7 @@ namespace Finn {
          * @param pConfig
          * @param batchSize
          */
-        BaseDriver(const Config& pConfig, uint batchSize) : configuration(pConfig), logger(Logger::getLogger()) { initializeBaseDriver(batchSize); }
+        BaseDriver(const Config& pConfig, uint batchSize) : configuration(pConfig) { initializeBaseDriver(batchSize); }
 
         /**
          * @brief Construct a new Base Driver object
@@ -114,10 +112,9 @@ namespace Finn {
          * @param outputDeviceIndex
          * @param outputKernelName
          * @param batchSize
-         * @param pForceAchieval
          */
-        BaseDriver(const std::filesystem::path& configPath, uint inputDeviceIndex, const std::string& inputKernelName, uint outputDeviceIndex, const std::string& outputKernelName, uint batchSize, bool pForceAchieval)
-            : configuration(createConfigFromPath(configPath)), logger(Logger::getLogger()), forceAchieval(pForceAchieval) {
+        BaseDriver(const std::filesystem::path& configPath, uint inputDeviceIndex, const std::string& inputKernelName, uint outputDeviceIndex, const std::string& outputKernelName, uint batchSize)
+            : configuration(createConfigFromPath(configPath)) {
             initializeBaseDriver(batchSize);
         }
 
@@ -131,12 +128,8 @@ namespace Finn {
          * @param outputDeviceIndex
          * @param outputKernelName
          * @param batchSize
-         * @param pForceAchieval
          */
-        BaseDriver(const Config& pConfig, uint inputDeviceIndex, const std::string& inputKernelName, uint outputDeviceIndex, const std::string& outputKernelName, uint batchSize, bool pForceAchieval)
-            : configuration(pConfig), logger(Logger::getLogger()), forceAchieval(pForceAchieval) {
-            initializeBaseDriver(batchSize);
-        }
+        BaseDriver(const Config& pConfig, uint inputDeviceIndex, const std::string& inputKernelName, uint outputDeviceIndex, const std::string& outputKernelName, uint batchSize) : configuration(pConfig) { initializeBaseDriver(batchSize); }
 
         /**
          * @brief Construct a new Base Driver object
@@ -211,13 +204,6 @@ namespace Finn {
          */
         uint getBatchSize() { return batchElements; }
 
-        /**
-         * @brief Set the Force Achieval
-         *
-         * @param force
-         */
-        void setForceAchieval(bool force) { forceAchieval = force; }
-
         /**
          * @brief Get the Config object. Simple getter to check things outside the driver
          *
@@ -242,16 +228,13 @@ namespace Finn {
          */
         std::shared_ptr<DeviceInputBuffer<uint8_t>> getInputBuffer(uint deviceIndex, const std::string& bufferName) { return getDeviceHandler(deviceIndex).getInputBuffer(bufferName); }
 
-        /**
-         * @brief Return the size (type specified by SIZE_SPECIFIER) at the given device at the given buffer
-         *
-         * @param ss
-         * @param deviceIndex
-         * @param bufferName
-         * @return size_t
-         */
-        size_t size(SIZE_SPECIFIER ss, uint deviceIndex, const std::string& bufferName) { return accelerator.size(ss, deviceIndex, bufferName); }
+        size_t getSizeInBytes(unsigned int deviceIndex, const std::string& bufferName) { return accelerator.getSizeInBytes(deviceIndex, bufferName); }
+
+        size_t getFeatureMapSize(unsigned int deviceIndex, const std::string& bufferName) { return accelerator.getFeatureMapSize(deviceIndex, bufferName); }
+
+        size_t getBatchSize(unsigned int deviceIndex, const std::string& bufferName) { return accelerator.getBatchSize(deviceIndex, bufferName); }
 
+        size_t getTotalDataSize(unsigned int deviceIndex, const std::string& bufferName) { return accelerator.getTotalDataSize(deviceIndex, bufferName); }
 
         /**
          * @brief Store input into the driver for asynchronous inference
@@ -265,14 +248,14 @@ namespace Finn {
          */
         template<typename IteratorType, typename = std::enable_if<!SynchronousInference>>
         void input(IteratorType first, IteratorType last, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint batchSize) {
-            FINN_LOG_DEBUG(logger, loglevel::info) << loggerPrefix() << "Store data for asynchronous inference.";
+            FINN_LOG_DEBUG(loglevel::info) << loggerPrefix() << "Store data for asynchronous inference.";
             auto packed = Finn::pack<F>(first, last);
             auto storeFunc = accelerator.storeFactory(inputDeviceIndex, inputBufferKernelName);
 
-            if (std::abs(std::distance(packed.begin(), packed.end())) != size(SIZE_SPECIFIER::FEATUREMAP_SIZE, inputDeviceIndex, inputBufferKernelName) * batchSize) {
+            if (std::abs(std::distance(packed.begin(), packed.end())) != getFeatureMapSize(inputDeviceIndex, inputBufferKernelName) * batchSize) {
                 FinnUtils::logAndError<std::runtime_error>("Input length (" + std::to_string(std::abs(std::distance(packed.begin(), packed.end()))) + ") does not match up with batches*inputsize_per_batch (" +
-                                                           std::to_string(size(SIZE_SPECIFIER::FEATUREMAP_SIZE, inputDeviceIndex, inputBufferKernelName)) + "*" + std::to_string(batchSize) + "=" +
-                                                           std::to_string(size(SIZE_SPECIFIER::FEATUREMAP_SIZE, inputDeviceIndex, inputBufferKernelName) * batchSize) + ")");
+                                                           std::to_string(getFeatureMapSize(inputDeviceIndex, inputBufferKernelName)) + "*" + std::to_string(batchSize) + "=" +
+                                                           std::to_string(getFeatureMapSize(inputDeviceIndex, inputBufferKernelName) * batchSize) + ")");
             }
 
             storeFunc(packed.begin(), packed.end());
@@ -297,13 +280,12 @@ namespace Finn {
          * @tparam V
          * @param outputDeviceIndex FPGA device from which data should be received
          * @param outputBufferKernelName Identifier of the output kernel
-         * @param forceArchival Should data be explicitly polled?
          * @return Finn::vector<V>
          */
         template<typename V = Finn::UnpackingAutoRetType::AutoRetType<S>, typename = std::enable_if<!SynchronousInference>>
-        [[nodiscard]] Finn::vector<V> getResults(uint outputDeviceIndex, const std::string& outputBufferKernelName, bool forceArchival) {
+        [[nodiscard]] Finn::vector<V> getResults(uint outputDeviceIndex, const std::string& outputBufferKernelName) {
             // TODO(linusjun): maybe this method should block until data is available?
-            auto result = accelerator.getOutputData(outputDeviceIndex, outputBufferKernelName, forceArchival);
+            auto result = accelerator.getOutputData(outputDeviceIndex, outputBufferKernelName);
             return unpack<S, V>(result);
         }
 
@@ -317,7 +299,7 @@ namespace Finn {
         template<typename V = Finn::UnpackingAutoRetType::AutoRetType<S>, typename = std::enable_if<!SynchronousInference>>
         [[nodiscard]] Finn::vector<V> getResults() {
             // TODO(linusjun): maybe this method should block until data is available?
-            auto result = accelerator.getOutputData(defaultOutputDeviceIndex, defaultOutputKernelName, forceAchieval);
+            auto result = accelerator.getOutputData(defaultOutputDeviceIndex, defaultOutputKernelName);
             return unpack<S, V>(result);
         }
 
@@ -333,12 +315,10 @@ namespace Finn {
          * @param inputBufferKernelName name of input kernel
          * @param outputDeviceIndex index of output FPGA
          * @param outputBufferKernelName name of output kernel
-         * @param forceArchival
          * @return Finn::vector<V>
          */
         template<typename IteratorType, typename V = Finn::UnpackingAutoRetType::AutoRetType<S>, typename = std::enable_if<SynchronousInference>>
-        [[nodiscard]] Finn::vector<V> inferSynchronous(IteratorType first, IteratorType last, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName,
-                                                       bool forceArchival) {
+        [[nodiscard]] Finn::vector<V> inferSynchronous(IteratorType first, IteratorType last, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName) {
             using IterValueType = typename std::iterator_traits<IteratorType>::value_type;
             static auto foldedShape = static_cast<Finn::ExtendedBufferDescriptor*>(configuration.deviceWrappers[inputDeviceIndex].idmas[0].get())->foldedShape;
             foldedShape[0] = batchElements;
@@ -346,7 +326,7 @@ namespace Finn {
 
             auto packed = Finn::packMultiDimensionalInputs<F, IteratorType>(first, last, reshapedInput, foldedShape.back());
 
-            auto result = infer(packed.begin(), packed.end(), inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, batchElements, forceArchival);
+            auto result = infer(packed.begin(), packed.end(), inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, batchElements);
 
             static auto packedOutput = configuration.deviceWrappers[inputDeviceIndex].odmas[0]->packedShape;
             packedOutput[0] = batchElements;
@@ -370,7 +350,7 @@ namespace Finn {
          */
         template<typename IteratorType, typename V = Finn::UnpackingAutoRetType::AutoRetType<S>, typename = std::enable_if<SynchronousInference>>
         [[nodiscard]] Finn::vector<V> inferSynchronous(IteratorType first, IteratorType last) {
-            return inferSynchronous(first, last, defaultInputDeviceIndex, defaultInputKernelName, defaultOutputDeviceIndex, defaultOutputKernelName, forceAchieval);
+            return inferSynchronous(first, last, defaultInputDeviceIndex, defaultInputKernelName, defaultOutputDeviceIndex, defaultOutputKernelName);
         }
 
         /**
@@ -384,12 +364,11 @@ namespace Finn {
          * @param inputBufferKernelName
          * @param outputDeviceIndex
          * @param outputBufferKernelName
-         * @param forceArchival
          * @return Finn::vector<V>
          */
         template<typename U, typename V = Finn::UnpackingAutoRetType::AutoRetType<S>, typename = std::enable_if<SynchronousInference>>
-        [[nodiscard]] Finn::vector<V> inferSynchronous(const Finn::vector<U>& data, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName, bool forceArchival) {
-            return inferSynchronous(data.begin(), data.end(), inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, batchElements, forceArchival);
+        [[nodiscard]] Finn::vector<V> inferSynchronous(const Finn::vector<U>& data, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName) {
+            return inferSynchronous(data.begin(), data.end(), inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, batchElements);
         }
 
         /**
@@ -403,7 +382,7 @@ namespace Finn {
          */
         template<typename U, typename V = Finn::UnpackingAutoRetType::AutoRetType<S>, typename = std::enable_if<SynchronousInference>>
         [[nodiscard]] Finn::vector<V> inferSynchronous(const Finn::vector<U>& data) {
-            return inferSynchronous(data, defaultInputDeviceIndex, defaultInputKernelName, defaultOutputDeviceIndex, defaultOutputKernelName, batchElements, forceAchieval);
+            return inferSynchronous(data, defaultInputDeviceIndex, defaultInputKernelName, defaultOutputDeviceIndex, defaultOutputKernelName, batchElements);
         }
 
 
@@ -418,19 +397,17 @@ namespace Finn {
          * @param outputDeviceIndex
          * @param outputBufferKernelName
          * @param batchSize
-         * @param forceArchival If true, the data gets written to LTS either way, ensuring that there is data to be read!
          * @return Finn::vector<uint8_t>
          */
         template<typename IteratorType>
-        [[nodiscard]] Finn::vector<uint8_t> infer(IteratorType first, IteratorType last, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName, uint batchSize,
-                                                  bool forceArchival) {
-            FINN_LOG_DEBUG(logger, loglevel::info) << loggerPrefix() << "Starting inference (raw data)";
+        [[nodiscard]] Finn::vector<uint8_t> infer(IteratorType first, IteratorType last, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName, uint batchSize) {
+            FINN_LOG_DEBUG(loglevel::info) << loggerPrefix() << "Starting inference (raw data)";
             auto storeFunc = accelerator.storeFactory(inputDeviceIndex, inputBufferKernelName);
 
-            if (std::abs(std::distance(first, last)) != size(SIZE_SPECIFIER::TOTAL_DATA_SIZE, inputDeviceIndex, inputBufferKernelName)) {
+            if (std::abs(std::distance(first, last)) != getTotalDataSize(inputDeviceIndex, inputBufferKernelName)) {
                 FinnUtils::logAndError<std::runtime_error>(loggerPrefix() + " Input length (" + std::to_string(std::abs(std::distance(first, last))) + ") does not match up with batches*inputsize_per_batch (" +
-                                                           std::to_string(size(SIZE_SPECIFIER::FEATUREMAP_SIZE, inputDeviceIndex, inputBufferKernelName)) + "*" + std::to_string(batchSize) + "=" +
-                                                           std::to_string(size(SIZE_SPECIFIER::TOTAL_DATA_SIZE, inputDeviceIndex, inputBufferKernelName)) + ")");
+                                                           std::to_string(getFeatureMapSize(inputDeviceIndex, inputBufferKernelName)) + "*" + std::to_string(batchSize) + "=" +
+                                                           std::to_string(getTotalDataSize(inputDeviceIndex, inputBufferKernelName)) + ")");
             }
 
             bool stored = storeFunc(first, last);
@@ -439,13 +416,13 @@ namespace Finn {
 
 #ifdef UNITTEST
             Finn::vector<uint8_t> data(first, last);
-            FINN_LOG(logger, loglevel::info) << "Readback from device buffer confirming data was written to board successfully: " << isSyncedDataEquivalent(inputDeviceIndex, inputBufferKernelName, data);
+            FINN_LOG(loglevel::info) << "Readback from device buffer confirming data was written to board successfully: " << isSyncedDataEquivalent(inputDeviceIndex, inputBufferKernelName, data);
 #endif
             accelerator.wait();
 
-            FINN_LOG_DEBUG(logger, loglevel::info) << "Reading out buffers";
+            FINN_LOG_DEBUG(loglevel::info) << "Reading out buffers";
             accelerator.read();
-            return accelerator.getOutputData(outputDeviceIndex, outputBufferKernelName, forceArchival);
+            return accelerator.getOutputData(outputDeviceIndex, outputBufferKernelName);
         }
 
         /**
@@ -458,12 +435,10 @@ namespace Finn {
          * @param outputDeviceIndex
          * @param outputBufferKernelName
          * @param batchSize
-         * @param forceArchival If true, the data gets written to LTS either way, ensuring that there is data to be read!
          * @return Finn::vector<uint8_t>
          */
-        [[nodiscard]] Finn::vector<uint8_t> infer(const Finn::vector<uint8_t>& data, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName, uint batchSize,
-                                                  bool forceArchival) {
-            return infer(data.begin(), data.end(), inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, batchSize, forceArchival);
+        [[nodiscard]] Finn::vector<uint8_t> infer(const Finn::vector<uint8_t>& data, uint inputDeviceIndex, const std::string& inputBufferKernelName, uint outputDeviceIndex, const std::string& outputBufferKernelName, uint batchSize) {
+            return infer(data.begin(), data.end(), inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, batchSize);
         }
 
          protected:
@@ -488,22 +463,22 @@ namespace Finn {
          *
          */
         void logDriver() {
-            FINN_LOG(logger, loglevel::info) << loggerPrefix() << "Driver Overview:\n";
+            FINN_LOG(loglevel::info) << loggerPrefix() << "Driver Overview:\n";
             for (DeviceHandler& devHandler : accelerator) {
-                FINN_LOG(logger, loglevel::info) << "\tDevice Index: " << devHandler.getDeviceIndex();
+                FINN_LOG(loglevel::info) << "\tDevice Index: " << devHandler.getDeviceIndex();
                 for (auto& keyValuePair : devHandler.getInputBufferMap()) {
-                    FINN_LOG(logger, loglevel::info) << "\t\tInput buffers: ";
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tName: " << keyValuePair.second->getName() << " (in hashmap as " << keyValuePair.first << ")";
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tShape packed: " << FinnUtils::shapeToString(keyValuePair.second->getPackedShape());
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tElements of type T (usually uint8_t) per sample: " << keyValuePair.second->size(SIZE_SPECIFIER::FEATUREMAP_SIZE);
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tElements of type T (usually uint8_t) in buffer overall: " << keyValuePair.second->size(SIZE_SPECIFIER::TOTAL_DATA_SIZE);
+                    FINN_LOG(loglevel::info) << "\t\tInput buffers: ";
+                    FINN_LOG(loglevel::info) << "\t\t\tName: " << keyValuePair.second->getName() << " (in hashmap as " << keyValuePair.first << ")";
+                    FINN_LOG(loglevel::info) << "\t\t\tShape packed: " << FinnUtils::shapeToString(keyValuePair.second->getPackedShape());
+                    FINN_LOG(loglevel::info) << "\t\t\tElements of type T (usually uint8_t) per sample: " << keyValuePair.second->getFeatureMapSize();
+                    FINN_LOG(loglevel::info) << "\t\t\tElements of type T (usually uint8_t) in buffer overall: " << keyValuePair.second->getTotalDataSize();
                 }
                 for (auto& keyValuePair : devHandler.getOutputBufferMap()) {
-                    FINN_LOG(logger, loglevel::info) << "\t\tOutput buffers: ";
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tName: " << keyValuePair.second->getName() << " (in hashmap as " << keyValuePair.first << ")";
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tShape packed: " << FinnUtils::shapeToString(keyValuePair.second->getPackedShape());
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tElements of type T (usually uint8_t) per sample: " << keyValuePair.second->size(SIZE_SPECIFIER::FEATUREMAP_SIZE);
-                    FINN_LOG(logger, loglevel::info) << "\t\t\tElements of type T (usually uint8_t) in buffer overall: " << keyValuePair.second->size(SIZE_SPECIFIER::TOTAL_DATA_SIZE);
+                    FINN_LOG(loglevel::info) << "\t\tOutput buffers: ";
+                    FINN_LOG(loglevel::info) << "\t\t\tName: " << keyValuePair.second->getName() << " (in hashmap as " << keyValuePair.first << ")";
+                    FINN_LOG(loglevel::info) << "\t\t\tShape packed: " << FinnUtils::shapeToString(keyValuePair.second->getPackedShape());
+                    FINN_LOG(loglevel::info) << "\t\t\tElements of type T (usually uint8_t) per sample: " << keyValuePair.second->getFeatureMapSize();
+                    FINN_LOG(loglevel::info) << "\t\t\tElements of type T (usually uint8_t) in buffer overall: " << keyValuePair.second->getTotalDataSize();
                 }
             }
         }
diff --git a/src/FINNCppDriver/core/CMakeLists.txt b/src/FINNCppDriver/core/CMakeLists.txt
index 6572a9d..949d28a 100644
--- a/src/FINNCppDriver/core/CMakeLists.txt
+++ b/src/FINNCppDriver/core/CMakeLists.txt
@@ -4,15 +4,13 @@ add_library(finnc_core SHARED ${CORE_SRC})
 target_include_directories(finnc_core SYSTEM PUBLIC ${XRT_INCLUDE_DIRS} ${FINN_SRC_DIR})
 target_link_directories(finnc_core PUBLIC ${XRT_LIB_CORE_LOCATION})
 target_link_directories(finnc_core PUBLIC ${XRT_LIB_OCL_LOCATION})
-target_link_libraries(finnc_core PUBLIC finnc_options finnc_utils finn_config Threads::Threads OpenCL xrt_coreutil rt uuid ${Boost_LIBRARIES})
-target_link_directories(finnc_core PRIVATE ${BOOST_LIBRARYDIR})
+target_link_libraries(finnc_core PUBLIC finnc_options finn_config Threads::Threads OpenCL xrt_coreutil rt uuid)
 
 add_library(finnc_core_test SHARED ${CORE_SRC})
 target_include_directories(finnc_core_test SYSTEM PUBLIC ${XRT_MOCK_INCLUDE} ${FINN_SRC_DIR})
-target_link_libraries(finnc_core_test PUBLIC ${Boost_LIBRARIES} finnc_options finnc_utils finn_config xrt_mock)
+target_link_libraries(finnc_core_test PUBLIC finnc_options finn_config xrt_mock)
 target_compile_definitions(finnc_core_test PRIVATE UNITTEST=1)
 
-target_link_directories(finnc_core_test PRIVATE ${BOOST_LIBRARYDIR})
 # target_include_directories(finnc_core_test SYSTEM PUBLIC ${XRT_INCLUDE_DIRS})
 # target_link_directories(finnc_core_test PUBLIC ${XRT_LIB_CORE_LOCATION})
 # target_link_directories(finnc_core_test PUBLIC ${XRT_LIB_OCL_LOCATION})
diff --git a/src/FINNCppDriver/core/DeviceBuffer/AsyncDeviceBuffers.hpp b/src/FINNCppDriver/core/DeviceBuffer/AsyncDeviceBuffers.hpp
index dfd18f5..8391549 100644
--- a/src/FINNCppDriver/core/DeviceBuffer/AsyncDeviceBuffers.hpp
+++ b/src/FINNCppDriver/core/DeviceBuffer/AsyncDeviceBuffers.hpp
@@ -16,6 +16,7 @@
 #include <FINNCppDriver/utils/FinnUtils.h>
 
 #include <FINNCppDriver/core/DeviceBuffer/DeviceBuffer.hpp>
+#include <FINNCppDriver/utils/SPSCQueue.hpp>
 #include <functional>
 #include <thread>
 
@@ -32,23 +33,24 @@ namespace Finn {
         template<typename T>
         class AsyncBufferWrapper {
              protected:
+            constexpr static size_t queueSize = 1024;  ///< Default size of the internal queue
             /**
-             * @brief Internal Ringbuffer used by all asynchronous buffers
+             * @brief Internal queue used by all asynchronous buffers
              *
              */
-            RingBuffer<T, true> ringBuffer;
+            SPSCQueue<T, queueSize> queue;
 
             /**
              * @brief Construct a new Async Buffer Wrapper object
              *
-             * @param ringBufferSizeFactor Number of batch elements that should be able to be stored
-             * @param elementsPerPart Number of values per batch element
+             * @param expectedMaxQueueSize Expected maximum size of the queue
              */
-            AsyncBufferWrapper(unsigned int ringBufferSizeFactor, std::size_t elementsPerPart) : ringBuffer(RingBuffer<T, true>(ringBufferSizeFactor, elementsPerPart)) {
-                if (ringBufferSizeFactor == 0) {
-                    FinnUtils::logAndError<std::runtime_error>("DeviceBuffer of size 0 cannot be constructed!");
+            AsyncBufferWrapper(std::size_t expectedMaxQueueSize) {
+                if (expectedMaxQueueSize > queueSize) {
+                    FINN_LOG(loglevel::warning) << "[AsyncDeviceBuffer] Expected maximum queue size (" << expectedMaxQueueSize << ") is larger than the async buffer queue size (" << queueSize
+                                                << "). This might lead to problems or performance issues. Consider increasing the queue size in the SPSCQueue template parameter.\n";
                 }
-                FINN_LOG(Logger::getLogger(), loglevel::info) << "[AsyncDeviceBuffer] Max buffer size:" << ringBufferSizeFactor << "*" << elementsPerPart << "\n";
+                FINN_LOG(loglevel::info) << "[AsyncDeviceBuffer] Max buffer size:" << queueSize << "\n";
             }
 
             /**
@@ -61,7 +63,7 @@ namespace Finn {
              *
              * @param buf
              */
-            AsyncBufferWrapper(AsyncBufferWrapper&& buf) noexcept : ringBuffer(std::move(buf.ringBuffer)) {}
+            AsyncBufferWrapper(AsyncBufferWrapper&& buf) noexcept {}
             /**
              * @brief Construct a new Async Buffer Wrapper object (Deleted Copy constructor)
              *
@@ -84,8 +86,16 @@ namespace Finn {
             AsyncBufferWrapper& operator=(const AsyncBufferWrapper& buf) = delete;
 #ifdef UNITTEST
              public:
-            RingBuffer<T, true>& testGetRingBuffer() { return this->ringBuffer; }
+            SPSCQueue<T, detail::AsyncBufferWrapper<T>::queueSize>& testGetQueue() { return this->queue; }
 #endif
+
+             public:
+            /**
+             * @brief Return the size of the buffer as specified by the argument.
+             *
+             * @return size_t
+             */
+            virtual size_t size() { return this->queue.size(); }
         };
     }  // namespace detail
 
@@ -105,15 +115,12 @@ namespace Finn {
          *
          */
         void runInternal(std::stop_token stoken) {
-            const std::size_t elementCount = this->ringBuffer.size(SIZE_SPECIFIER::FEATUREMAP_SIZE);
             while (!stoken.stop_requested()) {
-                if (!this->loadMap(stoken)) {  // blocks
-                    break;
-                }
-                this->sync(elementCount);
-                // this->execute(); TODO(linusjun): Fix all this shit!
+                this->sync(this->loadMap(stoken));
+                this->execute(this->shapePacked[0]);
+                // TODO(linusjun): Wait until kernel is done executing!
             }
-            FINN_LOG(this->logger, loglevel::info) << "Asynchronous Input buffer runner terminated";
+            FINN_LOG(loglevel::info) << "Asynchronous Input buffer runner terminated";
         }
 
          public:
@@ -128,8 +135,8 @@ namespace Finn {
          */
         AsyncDeviceInputBuffer(const std::string& pCUName, xrt::device& device, xrt::uuid& pDevUUID, const shapePacked_t& pShapePacked, unsigned int ringBufferSizeFactor)
             : DeviceInputBuffer<T>(pCUName, device, pDevUUID, pShapePacked),
-              detail::AsyncBufferWrapper<T>(ringBufferSizeFactor, FinnUtils::shapeToElements(pShapePacked)),
-              workerThread(std::jthread(std::bind_front(&AsyncDeviceInputBuffer::runInternal, this))){};
+              detail::AsyncBufferWrapper<T>(ringBufferSizeFactor * FinnUtils::shapeToElements(pShapePacked)),
+              workerThread(std::jthread(std::bind_front(&AsyncDeviceInputBuffer::runInternal, this))) {}
 
         /**
          * @brief Construct a new Async Device Input Buffer object
@@ -148,7 +155,7 @@ namespace Finn {
          *
          */
         ~AsyncDeviceInputBuffer() override {
-            FINN_LOG(this->logger, loglevel::info) << "Destructing Asynchronous input buffer";
+            FINN_LOG(loglevel::info) << "Destructing Asynchronous input buffer";
             workerThread.request_stop();  // Joining will be handled automatically by destruction
         };
         /**
@@ -166,14 +173,6 @@ namespace Finn {
          */
         AsyncDeviceInputBuffer& operator=(const AsyncDeviceInputBuffer& buf) = delete;
 
-        /**
-         * @brief Return the size of the buffer as specified by the argument. Bytes returns all bytes the buffer takes up, elements returns the number of T-values, numbers the number of F-values.
-         *
-         * @param ss
-         * @return size_t
-         */
-        size_t size(SIZE_SPECIFIER ss) override { return this->ringBuffer.size(ss); }
-
         /**
          * @brief Store the given data in the ring buffer
          *
@@ -181,19 +180,26 @@ namespace Finn {
          * @return true Store was successful
          * @return false Store failed
          */
-        bool store(std::span<const T> data) override { return this->ringBuffer.store(data.begin(), data.end()); }
+        bool store(std::span<const T> data) override {
+            if (this->queue.enqueue_bulk(data.data(), data.size()) == data.size()) {
+                FINN_LOG_DEBUG(loglevel::info) << this->loggerPrefix() << "Stored " << data.size() << " elements in the ring buffer";
+                return true;
+            } else {
+                FINN_LOG_DEBUG(loglevel::error) << this->loggerPrefix() << "Failed to store data in the ring buffer.";
+                return false;
+            }
+        }
 
          protected:
         /**
          * @brief  Load data from the ring buffer into the memory map of the device.
          * @attention Invalidates the data that was moved to map
          *
-         * @return true
-         * @return false
+         * @return Number of bytes loaded into the map
          */
-        bool loadMap(std::stop_token stoken) {
-            FINN_LOG(this->logger, loglevel::info) << "Data transfer of input data to FPGA!\n";
-            return this->ringBuffer.read(this->map, stoken);
+        size_t loadMap(std::stop_token stoken) {
+            FINN_LOG_DEBUG(loglevel::info) << "Data transfer of input data to FPGA!\n";
+            return this->queue.dequeue_bulk(this->map, this->totalDataSize, stoken);
         }
 
         /**
@@ -216,25 +222,12 @@ namespace Finn {
         std::mutex ltsMutex;
         std::jthread workerThread;
 
-         private:
         void readInternal(std::stop_token stoken) {
-            FINN_LOG_DEBUG(this->logger, loglevel::info) << this->loggerPrefix() << "Starting to read from the device";
-            const std::size_t elementCount = this->ringBuffer.size(SIZE_SPECIFIER::FEATUREMAP_SIZE);
+            FINN_LOG_DEBUG(loglevel::info) << this->loggerPrefix() << "Starting to read from the device";
             while (!stoken.stop_requested()) {
-                // auto outExecuteResult = execute();
-                // std::cout << outExecuteResult << "\n";
-                // if (outExecuteResult != ERT_CMD_STATE_COMPLETED && outExecuteResult != ERT_CMD_STATE_ERROR && outExecuteResult != ERT_CMD_STATE_ABORT) {
-                //     continue;
-                // }
-                // if (outExecuteResult == ERT_CMD_STATE_ERROR || outExecuteResult == ERT_CMD_STATE_ABORT) {
-                //     FINN_LOG(this->logger, loglevel::error) << "A problem has occured during the read process of the FPGA output.";
-                //     continue;
-                // }
-                this->sync(elementCount);
-                saveMap();
-                if (this->ringBuffer.full()) {  // TODO(linusjun): Allow registering of callback for this event?
-                    archiveValidBufferParts();
-                }
+                this->execute(this->shapePacked[0]);
+                this->sync(this->totalDataSize);
+                saveMap();  // TODO: Maybe the queue should have a callback that is called when the queue is full/data is avaible?
             }
         }
 
@@ -250,7 +243,7 @@ namespace Finn {
          */
         AsyncDeviceOutputBuffer(const std::string& pCUName, xrt::device& device, xrt::uuid& pDevUUID, const shapePacked_t& pShapePacked, unsigned int ringBufferSizeFactor)
             : DeviceOutputBuffer<T>(pCUName, device, pDevUUID, pShapePacked),
-              detail::AsyncBufferWrapper<T>(ringBufferSizeFactor, FinnUtils::shapeToElements(pShapePacked)),
+              detail::AsyncBufferWrapper<T>(ringBufferSizeFactor * FinnUtils::shapeToElements(pShapePacked)),
               workerThread(std::jthread(std::bind_front(&AsyncDeviceOutputBuffer::readInternal, this))){};
 
         /**
@@ -270,8 +263,9 @@ namespace Finn {
          *
          */
         ~AsyncDeviceOutputBuffer() override {
-            FINN_LOG(this->logger, loglevel::info) << "Destruction Asynchronous output buffer";
+            FINN_LOG(loglevel::info) << "Destruction Asynchronous output buffer";
             workerThread.request_stop();  // Joining will be handled automatically by destruction
+            this->queue.shutdown();       // Shutdown the queue to prevent further enqueues
         };
 
         /**
@@ -290,45 +284,6 @@ namespace Finn {
          */
         AsyncDeviceOutputBuffer& operator=(const AsyncDeviceOutputBuffer& buf) = delete;
 
-        /**
-         * @brief Return the size of the buffer as specified by the argument. Bytes returns all bytes the buffer takes up, elements returns the number of T-values, numbers the number of F-values.
-         *
-         * @param ss
-         * @return size_t
-         */
-        size_t size(SIZE_SPECIFIER ss) override { return this->ringBuffer.size(ss); }
-
-        /**
-         * @brief Put every valid read part of the ring buffer into the archive. This invalides them so that they are not put into the archive again.
-         * @note After the function is executed, all parts are invalid.
-         * @note This function can be executed manually instead of wait for it to be called by read() when the ring buffer is full.
-         *
-         */
-        void archiveValidBufferParts() {
-            std::lock_guard guard(ltsMutex);
-            this->longTermStorage.reserve(this->longTermStorage.size() + this->ringBuffer.size());
-            this->ringBuffer.readAllValidParts(std::back_inserter(this->longTermStorage));
-        }
-
-        /**
-         * @brief Return the archive.
-         *
-         * @return Finn::vector<T>
-         */
-        Finn::vector<T> getData() {
-            std::lock_guard guard(ltsMutex);
-            Finn::vector<T> tmp(this->longTermStorage);
-            clearArchive();
-            return tmp;
-        }
-
-        /**
-         * @brief Reserve enough storage for the expectedEntries number of entries. Note however that because this is a vec of vecs, this only allocates memory for the pointers, not the data itself.
-         *
-         * @param expectedEntries
-         */
-        void allocateLongTermStorage([[maybe_unused]] unsigned int expectedEntries) { this->longTermStorage.reserve(expectedEntries * this->ringBuffer.size(SIZE_SPECIFIER::FEATUREMAP_SIZE)); }
-
         /**
          * @brief Not supported by the AsyncDeviceOutputBuffer.
          *
@@ -344,21 +299,32 @@ namespace Finn {
          */
         bool run() override { return false; }
 
-         protected:
         /**
-         * @brief Store the contents of the memory map into the ring buffer.
+         *  @brief Return the data contained in the FPGA Buffer map.
          *
+         * @return Finn::vector<T>
          */
-        void saveMap() {
-            FINN_LOG(this->logger, loglevel::info) << "Data transfer of output from FPGA!\n";
-            this->ringBuffer.template store<T*>(this->map, this->ringBuffer.size(SIZE_SPECIFIER::FEATUREMAP_SIZE));
+        Finn::vector<T> getData() override {  // TODO(linusjun): Replace with a variant that takes a output iterator
+            Finn::vector<T> tmp(this->totalDataSize);
+            this->queue.dequeue_bulk(tmp.begin(), this->totalDataSize);
+            return tmp;
         }
 
+         protected:
         /**
-         * @brief Clear the archive of all it's entries
+         * @brief Store the contents of the memory map into the ring buffer.
          *
          */
-        void clearArchive() { this->longTermStorage.clear(); }
+        bool saveMap() {
+            FINN_LOG_DEBUG(loglevel::info) << "Data transfer of output from FPGA!\n";
+            if (this->queue.enqueue_bulk(this->map, this->totalDataSize) == this->totalDataSize) {
+                FINN_LOG_DEBUG(loglevel::info) << this->loggerPrefix() << "Stored " << this->totalDataSize << " elements in the ring buffer";
+                return true;
+            } else {
+                FINN_LOG_DEBUG(loglevel::error) << this->loggerPrefix() << "Failed to store data in the ring buffer.";
+                return false;
+            }
+        }
     };
 }  // namespace Finn
 
diff --git a/src/FINNCppDriver/core/DeviceBuffer/DeviceBuffer.hpp b/src/FINNCppDriver/core/DeviceBuffer/DeviceBuffer.hpp
index e643e75..e95dfd0 100644
--- a/src/FINNCppDriver/core/DeviceBuffer/DeviceBuffer.hpp
+++ b/src/FINNCppDriver/core/DeviceBuffer/DeviceBuffer.hpp
@@ -14,11 +14,10 @@
 #define DEVICEBUFFER
 
 #include <FINNCppDriver/config/CompilationOptions.h>
-#include <FINNCppDriver/utils/Logger.h>
 #include <FINNCppDriver/utils/Types.h>
 
-#include <FINNCppDriver/utils/RingBuffer.hpp>
-#include <boost/type_index.hpp>
+#include <FINNCppDriver/utils/FinnDatatypes.hpp>
+#include <FINNCppDriver/utils/Logger.hpp>
 #include <chrono>
 #include <future>
 #include <span>
@@ -92,11 +91,9 @@ namespace Finn {
          *
          */
         const long long bufAdr;
-        /**
-         * @brief Logger
-         *
-         */
-        logger_type& logger;
+
+        std::size_t totalDataSize;
+        std::size_t featureMapSize;
 
         void busyWait() {
             // Wait until the IP is DONE
@@ -138,16 +135,17 @@ namespace Finn {
               internalBo(xrt::bo(device, mapSize * sizeof(T), DeviceBuffer::getFlags(Finn::Options::hostMemoryAccess), 0)),
               map(internalBo.template map<T*>()),
               assocIPCore(xrt::ip(device, pDevUUID, pCUName)),  // Using xrt::kernel/getGroupId after this point leads to a total bricking of the FPGA card!!
-              bufAdr(internalBo.address()),
-              logger(Logger::getLogger()) {
+              bufAdr(internalBo.address()) {
             shapePacked[0] = batchSize;
-            FINN_LOG(logger, loglevel::info) << "[DeviceBuffer] "
-                                             << "New Device Buffer of size " << mapSize * sizeof(T) << "bytes with group id " << 0 << "\n";
-            FINN_LOG(logger, loglevel::info) << "[DeviceBuffer] "
-                                             << "Host Memory Access enabled: " << Finn::Options::hostMemoryAccess << "\n";
-            FINN_LOG(logger, loglevel::info) << "[DeviceBuffer] "
-                                             << "Initializing DeviceBuffer " << name << " (SHAPE PACKED: " << FinnUtils::shapeToString(pShapePacked) << " inputs of the given shape, MAP SIZE: " << mapSize << ")\n";
+            FINN_LOG(loglevel::info) << "[DeviceBuffer] "
+                                     << "New Device Buffer of size " << mapSize * sizeof(T) << "bytes with group id " << 0 << "\n";
+            FINN_LOG(loglevel::info) << "[DeviceBuffer] "
+                                     << "Host Memory Access enabled: " << Finn::Options::hostMemoryAccess << "\n";
+            FINN_LOG(loglevel::info) << "[DeviceBuffer] "
+                                     << "Initializing DeviceBuffer " << name << " (SHAPE PACKED: " << FinnUtils::shapeToString(pShapePacked) << " inputs of the given shape, MAP SIZE: " << mapSize << ")\n";
             std::fill(map, map + mapSize, 0);
+            totalDataSize = FinnUtils::shapeToElements(pShapePacked) * batchSize;
+            featureMapSize = totalDataSize / shapePacked[0];
         }
 
         /**
@@ -155,14 +153,7 @@ namespace Finn {
          * @param buf
          */
         DeviceBuffer(DeviceBuffer&& buf) noexcept
-            : name(std::move(buf.name)),
-              shapePacked(std::move(buf.shapePacked)),
-              mapSize(buf.mapSize),
-              internalBo(std::move(buf.internalBo)),
-              assocIPCore(std::move(buf.assocIPCore)),
-              map(std::move(buf.map)),
-              bufAdr(internalBo.address()),
-              logger(Logger::getLogger()) {}
+            : name(std::move(buf.name)), shapePacked(std::move(buf.shapePacked)), mapSize(buf.mapSize), internalBo(std::move(buf.internalBo)), assocIPCore(std::move(buf.assocIPCore)), map(std::move(buf.map)), bufAdr(internalBo.address()) {}
 
         /**
          * @brief Construct a new Device Buffer object (Deleted copy constructor)
@@ -175,7 +166,7 @@ namespace Finn {
          * @brief Destroy the Device Buffer object
          *
          */
-        virtual ~DeviceBuffer() { FINN_LOG(logger, loglevel::info) << "[DeviceBuffer] Destructing DeviceBuffer " << name << "\n"; };
+        virtual ~DeviceBuffer() { FINN_LOG(loglevel::info) << "[DeviceBuffer] Destructing DeviceBuffer " << name << "\n"; };
 
         /**
          * @brief Deleted move assignment operator
@@ -193,13 +184,13 @@ namespace Finn {
          */
         DeviceBuffer& operator=(const DeviceBuffer& buf) = delete;
 
-        /**
-         * @brief Returns a specific size parameter of DeviceBuffer. Size parameter selected with @see SIZE_SPECIFIER
-         *
-         * @param ss @see SIZE_SPECIFIER
-         * @return size_t
-         */
-        virtual size_t size(SIZE_SPECIFIER ss) = 0;
+        virtual size_t getSizeInBytes() { return totalDataSize * sizeof(T); }
+
+        virtual size_t getFeatureMapSize() { return featureMapSize; }
+
+        virtual size_t getBatchSize() { return this->shapePacked[0]; }
+
+        virtual size_t getTotalDataSize() { return totalDataSize; }
 
         /**
          * @brief Get the name of the device buffer
@@ -234,7 +225,7 @@ namespace Finn {
          *
          * @return std::string
          */
-        virtual std::string loggerPrefix() { return "[" + boost::typeindex::type_id<decltype(*this)>().pretty_name() + " - " + name + "] "; }
+        virtual std::string loggerPrefix() { return "[" + std::string(Finn::type_name<decltype(*this)>()) + " - " + name + "] "; }
 
         /**
          * @brief Synchronizes the Buffer data to the data on the FPGA
@@ -358,11 +349,6 @@ namespace Finn {
          *
          */
         const IO ioMode = IO::OUTPUT;
-        /**
-         * @brief Data storage until data is requested by user
-         *
-         */
-        Finn::vector<T> longTermStorage;
         /**
          * @brief Timeout for kernels
          *
@@ -424,9 +410,7 @@ namespace Finn {
 
         void testSetMap(const Finn::vector<T>& data) { testSetMap(data.begin(), data.end()); }
 
-        unsigned int testGetLongTermStorageSize() const { return longTermStorage.size(); }
         xrt::bo& testGetInternalBO() { return this->interalBo; }
-        Finn::vector<T>& testGetLTS() { return longTermStorage; }
 #endif
     };
 }  // namespace Finn
diff --git a/src/FINNCppDriver/core/DeviceBuffer/SyncDeviceBuffers.hpp b/src/FINNCppDriver/core/DeviceBuffer/SyncDeviceBuffers.hpp
index f6ea244..3252afc 100644
--- a/src/FINNCppDriver/core/DeviceBuffer/SyncDeviceBuffers.hpp
+++ b/src/FINNCppDriver/core/DeviceBuffer/SyncDeviceBuffers.hpp
@@ -21,6 +21,9 @@
 namespace Finn {
     template<typename T>
     class SyncDeviceInputBuffer : public DeviceInputBuffer<T> {
+         private:
+        friend class DeviceInputBuffer<T>;
+
          public:
         /**
          * @brief Construct a new Sync Device Input Buffer object
@@ -32,8 +35,8 @@ namespace Finn {
          * @param batchSize batch size
          */
         SyncDeviceInputBuffer(const std::string& pCUName, xrt::device& device, xrt::uuid& pDevUUID, const shapePacked_t& pShapePacked, unsigned int batchSize) : DeviceInputBuffer<T>(pCUName, device, pDevUUID, pShapePacked, batchSize) {
-            FINN_LOG(this->logger, loglevel::info) << "[SyncDeviceInputBuffer] "
-                                                   << "Initializing DeviceBuffer " << this->name << " (SHAPE PACKED: " << FinnUtils::shapeToString(pShapePacked) << " inputs of the given shape, MAP SIZE: " << this->mapSize << ")\n";
+            FINN_LOG(loglevel::info) << "[SyncDeviceInputBuffer] "
+                                     << "Initializing DeviceBuffer " << this->name << " (SHAPE PACKED: " << FinnUtils::shapeToString(pShapePacked) << " inputs of the given shape, MAP SIZE: " << this->mapSize << ")\n";
             this->shapePacked[0] = batchSize;
         };
 
@@ -79,29 +82,6 @@ namespace Finn {
          protected:
 #endif
 
-         private:
-        friend class DeviceInputBuffer<T>;
-
-         public:
-        size_t size(SIZE_SPECIFIER ss) override {
-            switch (ss) {
-                case SIZE_SPECIFIER::BYTES: {
-                    return FinnUtils::shapeToElements(this->shapePacked) * sizeof(T);
-                }
-                case SIZE_SPECIFIER::FEATUREMAP_SIZE: {
-                    return FinnUtils::shapeToElements(this->shapePacked) / this->shapePacked[0];
-                }
-                case SIZE_SPECIFIER::BATCHSIZE: {
-                    return this->shapePacked[0];
-                }
-                case SIZE_SPECIFIER::TOTAL_DATA_SIZE: {
-                    return FinnUtils::shapeToElements(this->shapePacked);
-                }
-                default:
-                    return 0;
-            }
-        }
-
         /**
          * @brief Store the given data in the input map of the FPGA
          *
@@ -121,7 +101,7 @@ namespace Finn {
          * @return false
          */
         bool run() override {
-            FINN_LOG_DEBUG(this->logger, loglevel::info) << this->loggerPrefix() << "DeviceBuffer (" << this->name << ") executing...";
+            FINN_LOG_DEBUG(loglevel::info) << this->loggerPrefix() << "DeviceBuffer (" << this->name << ") executing...";
             this->sync(FinnUtils::shapeToElements(this->shapePacked));
             this->execute(this->shapePacked[0]);
             return true;
@@ -135,9 +115,6 @@ namespace Finn {
      */
     template<typename T>
     class SyncDeviceOutputBuffer : public DeviceOutputBuffer<T> {
-         private:
-        std::size_t elementCount;
-
          public:
         /**
          * @brief Construct a new Synchronous Device Output Buffer object
@@ -148,9 +125,8 @@ namespace Finn {
          * @param pShapePacked packed shape of input
          * @param ringBufferSizeFactor size of ringbuffer in input elements (batch elements)
          */
-        SyncDeviceOutputBuffer(const std::string& pCUName, xrt::device& device, xrt::uuid& pDevUUID, const shapePacked_t& pShapePacked, unsigned int batchSize) : DeviceOutputBuffer<T>(pCUName, device, pDevUUID, pShapePacked) {
+        SyncDeviceOutputBuffer(const std::string& pCUName, xrt::device& device, xrt::uuid& pDevUUID, const shapePacked_t& pShapePacked, unsigned int batchSize) : DeviceOutputBuffer<T>(pCUName, device, pDevUUID, pShapePacked, batchSize) {
             this->shapePacked[0] = batchSize;
-            elementCount = FinnUtils::shapeToElements(this->shapePacked);
         };
 
         /**
@@ -185,38 +161,13 @@ namespace Finn {
          */
         SyncDeviceOutputBuffer& operator=(const SyncDeviceOutputBuffer& buf) = delete;
 
-        /**
-         * @brief Return the size of the buffer as specified by the argument. Bytes returns all bytes the buffer takes up, elements returns the number of T-values, numbers the number of F-values.
-         *
-         * @param ss
-         * @return size_t
-         */
-        size_t size(SIZE_SPECIFIER ss) override {
-            switch (ss) {
-                case SIZE_SPECIFIER::BYTES: {
-                    return elementCount * sizeof(T);
-                }
-                case SIZE_SPECIFIER::FEATUREMAP_SIZE: {
-                    return elementCount / this->shapePacked[0];
-                }
-                case SIZE_SPECIFIER::BATCHSIZE: {
-                    return this->shapePacked[0];
-                }
-                case SIZE_SPECIFIER::TOTAL_DATA_SIZE: {
-                    return elementCount;
-                }
-                default:
-                    return 0;
-            }
-        }
-
         /**
          * @brief Return the data contained in the FPGA Buffer map.
          *
          * @return Finn::vector<T>
          */
         Finn::vector<T> getData() override {
-            Finn::vector<T> tmp(this->map, this->map + elementCount);
+            Finn::vector<T> tmp(this->map, this->map + this->totalDataSize);
             return tmp;
         }
 
@@ -227,7 +178,7 @@ namespace Finn {
          * @return false
          */
         bool run() override {
-            FINN_LOG_DEBUG(this->logger, loglevel::info) << this->loggerPrefix() << "DeviceBuffer (" << this->name << ") executing...";
+            FINN_LOG_DEBUG(loglevel::info) << this->loggerPrefix() << "DeviceBuffer (" << this->name << ") executing...";
             this->execute(this->shapePacked[0]);
             return true;
         }
@@ -238,8 +189,8 @@ namespace Finn {
          * @return bool
          */
         bool read() override {
-            FINN_LOG_DEBUG(this->logger, loglevel::info) << this->loggerPrefix() << "Synching  " << elementCount << " bytes from the device";
-            this->sync(elementCount);
+            FINN_LOG_DEBUG(loglevel::info) << this->loggerPrefix() << "Synching  " << this->totalDataSize << " bytes from the device";
+            this->sync(this->totalDataSize);
             return true;
         }
     };
diff --git a/src/FINNCppDriver/core/DeviceHandler.cpp b/src/FINNCppDriver/core/DeviceHandler.cpp
index aba0c35..5c7b8d6 100644
--- a/src/FINNCppDriver/core/DeviceHandler.cpp
+++ b/src/FINNCppDriver/core/DeviceHandler.cpp
@@ -11,14 +11,13 @@
  */
 
 #include <FINNCppDriver/core/DeviceHandler.h>
-#include <FINNCppDriver/utils/Logger.h>
 #include <FINNCppDriver/utils/Types.h>
 
 #include <FINNCppDriver/core/DeviceBuffer/AsyncDeviceBuffers.hpp>
 #include <FINNCppDriver/core/DeviceBuffer/DeviceBuffer.hpp>
 #include <FINNCppDriver/core/DeviceBuffer/SyncDeviceBuffers.hpp>
+#include <FINNCppDriver/utils/Logger.hpp>
 #include <algorithm>  // for copy
-#include <boost/cstdint.hpp>
 #include <cerrno>
 #include <chrono>
 #include <filesystem>  // for path
@@ -44,7 +43,7 @@ namespace Finn {
         initializeDevice();
         loadXclbinSetUUID();
         initializeBufferObjects(devWrap, hostBufferSize, pSynchronousInference);
-        FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "Finished setting up device " << xrtDeviceIndex;
+        FINN_LOG(loglevel::info) << loggerPrefix() << "Finished setting up device " << xrtDeviceIndex;
     }
 
     std::string DeviceHandler::loggerPrefix() { return "[DeviceHandler] "; }
@@ -83,20 +82,20 @@ namespace Finn {
     }
 
     void DeviceHandler::initializeDevice() {
-        FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
-                                                      << "Initializing xrt::device, loading xclbin and assigning IP\n";
+        FINN_LOG(loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
+                                 << "Initializing xrt::device, loading xclbin and assigning IP\n";
         device = xrt::device(xrtDeviceIndex);
     }
 
     void DeviceHandler::loadXclbinSetUUID() {
-        FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
-                                                      << "Loading XCLBIN and setting uuid\n";
+        FINN_LOG(loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
+                                 << "Loading XCLBIN and setting uuid\n";
         uuid = device.load_xclbin(xclbinPath);
     }
 
     void DeviceHandler::initializeBufferObjects(const DeviceWrapper& devWrap, unsigned int hostBufferSize, bool pSynchronousInference) {
-        FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
-                                                      << "Initializing buffer objects with buffer size " << hostBufferSize << "\n";
+        FINN_LOG(loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
+                                 << "Initializing buffer objects with buffer size " << hostBufferSize << "\n";
         for (auto&& ebdptr : devWrap.idmas) {
             if (pSynchronousInference) {
                 inputBufferMap.emplace(std::make_pair(ebdptr->kernelName, std::make_shared<Finn::SyncDeviceInputBuffer<uint8_t>>(ebdptr->kernelName, device, uuid, ebdptr->packedShape, hostBufferSize)));
@@ -110,11 +109,10 @@ namespace Finn {
                 outputBufferMap.emplace(std::make_pair(ebdptr->kernelName, ptr));
             } else {
                 auto ptr = std::make_shared<Finn::AsyncDeviceOutputBuffer<uint8_t>>(ebdptr->kernelName, device, uuid, ebdptr->packedShape, hostBufferSize);
-                ptr->allocateLongTermStorage(hostBufferSize * 5);
                 outputBufferMap.emplace(std::make_pair(ebdptr->kernelName, ptr));
             }
         }
-        FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "Finished initializing buffer objects on device " << xrtDeviceIndex;
+        FINN_LOG(loglevel::info) << loggerPrefix() << "Finished initializing buffer objects on device " << xrtDeviceIndex;
 
 #ifndef NDEBUG
         isBufferMapCollisionFree();
@@ -127,8 +125,8 @@ namespace Finn {
         if (this->batchsize == pBatchsize) {
             return;
         } else {
-            FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
-                                                          << "Change batch size to " << pBatchsize << "\n";
+            FINN_LOG(loglevel::info) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
+                                     << "Change batch size to " << pBatchsize << "\n";
             this->batchsize = pBatchsize;
             inputBufferMap.clear();
             outputBufferMap.clear();
@@ -194,25 +192,48 @@ namespace Finn {
     }
 
 
-    [[maybe_unused]] Finn::vector<uint8_t> DeviceHandler::retrieveResults(const std::string& outputBufferKernelName, bool forceArchival) {
+    [[maybe_unused]] Finn::vector<uint8_t> DeviceHandler::retrieveResults(const std::string& outputBufferKernelName) {
         if (!outputBufferMap.contains(outputBufferKernelName)) {
             auto newlineFold = [](std::string a, const auto& b) { return std::move(a) + '\n' + std::move(b.first); };
             std::string existingNames = "Existing buffer names:";
             std::accumulate(inputBufferMap.begin(), inputBufferMap.end(), existingNames, newlineFold);
             FinnUtils::logAndError<std::runtime_error>(loggerPrefix() + " [retrieve] Tried accessing kernel/buffer with name " + outputBufferKernelName + " but this kernel / buffer does not exist! " + existingNames);
         }
-        if (forceArchival) {
-            // TODO(linusjun): Fix for asynchronous inference
-            // outputBufferMap.at(outputBufferKernelName)->archiveValidBufferParts();
-        }
         return outputBufferMap.at(outputBufferKernelName)->getData();
     }
 
-    size_t DeviceHandler::size(SIZE_SPECIFIER ss, const std::string& bufferName) {
+    size_t DeviceHandler::getSizeInBytes(const std::string& bufferName) {
+        if (inputBufferMap.contains(bufferName)) {
+            return inputBufferMap.at(bufferName)->getSizeInBytes();
+        } else if (outputBufferMap.contains(bufferName)) {
+            return outputBufferMap.at(bufferName)->getSizeInBytes();
+        }
+        return 0;
+    }
+
+    size_t DeviceHandler::getFeatureMapSize(const std::string& bufferName) {
+        if (inputBufferMap.contains(bufferName)) {
+            return inputBufferMap.at(bufferName)->getFeatureMapSize();
+        } else if (outputBufferMap.contains(bufferName)) {
+            return outputBufferMap.at(bufferName)->getFeatureMapSize();
+        }
+        return 0;
+    }
+
+    size_t DeviceHandler::getBatchSize(const std::string& bufferName) {
+        if (inputBufferMap.contains(bufferName)) {
+            return inputBufferMap.at(bufferName)->getBatchSize();
+        } else if (outputBufferMap.contains(bufferName)) {
+            return outputBufferMap.at(bufferName)->getBatchSize();
+        }
+        return 0;
+    }
+
+    size_t DeviceHandler::getTotalDataSize(const std::string& bufferName) {
         if (inputBufferMap.contains(bufferName)) {
-            return inputBufferMap.at(bufferName)->size(ss);
+            return inputBufferMap.at(bufferName)->getTotalDataSize();
         } else if (outputBufferMap.contains(bufferName)) {
-            return outputBufferMap.at(bufferName)->size(ss);
+            return outputBufferMap.at(bufferName)->getTotalDataSize();
         }
         return 0;
     }
@@ -223,15 +244,15 @@ namespace Finn {
         bool collisionFound = false;
         for (size_t index = 0; index < inputBufferMap.bucket_count(); ++index) {
             if (inputBufferMap.bucket_size(index) > 1) {
-                FINN_LOG_DEBUG(Logger::getLogger(), loglevel::error) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
-                                                                     << "Hash collision in inputBufferMap. This access to the inputBufferMap is no longer constant time!";
+                FINN_LOG_DEBUG(loglevel::error) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
+                                                << "Hash collision in inputBufferMap. This access to the inputBufferMap is no longer constant time!";
                 collisionFound = true;
             }
         }
         for (size_t index = 0; index < outputBufferMap.bucket_count(); ++index) {
             if (outputBufferMap.bucket_size(index) > 1) {
-                FINN_LOG_DEBUG(Logger::getLogger(), loglevel::error) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
-                                                                     << "Hash collision in outputBufferMap. This access to the outputBufferMap is no longer constant time!";
+                FINN_LOG_DEBUG(loglevel::error) << loggerPrefix() << "(" << xrtDeviceIndex << ") "
+                                                << "Hash collision in outputBufferMap. This access to the outputBufferMap is no longer constant time!";
                 collisionFound = true;
             }
         }
diff --git a/src/FINNCppDriver/core/DeviceHandler.h b/src/FINNCppDriver/core/DeviceHandler.h
index 5050f33..6f21ccc 100644
--- a/src/FINNCppDriver/core/DeviceHandler.h
+++ b/src/FINNCppDriver/core/DeviceHandler.h
@@ -127,7 +127,7 @@ namespace Finn {
          * @brief Destroy the Device Handler object
          *
          */
-        ~DeviceHandler() { FINN_LOG(Logger::getLogger(), loglevel::info) << loggerPrefix() << "Tearing down DeviceHandler\n"; };
+        ~DeviceHandler() { FINN_LOG(loglevel::info) << loggerPrefix() << "Tearing down DeviceHandler\n"; };
 
         /**
          * @brief Sets the input batch size. Needs to reinitialize all buffers!
@@ -199,19 +199,17 @@ namespace Finn {
          * @brief Read from the output buffer on the host. This does NOT execute the output kernel
          *
          * @param outputBufferKernelName
-         * @param forceArchival If true, the data gets copied from the buffer to the long term storage immediately. If false, the newest read data might not actually be returned by this function
          * @return Finn::vector<uint8_t>
          */
-        Finn::vector<uint8_t> retrieveResults(const std::string& outputBufferKernelName, bool forceArchival);
+        Finn::vector<uint8_t> retrieveResults(const std::string& outputBufferKernelName);
 
-        /**
-         * @brief Return the buffer sizes
-         *
-         * @param ss
-         * @param bufferName
-         * @return size_t
-         */
-        size_t size(SIZE_SPECIFIER ss, const std::string& bufferName);
+        size_t getSizeInBytes(const std::string& bufferName);
+
+        size_t getFeatureMapSize(const std::string& bufferName);
+
+        size_t getBatchSize(const std::string& bufferName);
+
+        size_t getTotalDataSize(const std::string& bufferName);
 
         /**
          * @brief Return whether there is a kernel with the given name in this device
diff --git a/src/FINNCppDriver/utils/CMakeLists.txt b/src/FINNCppDriver/utils/CMakeLists.txt
index d750eee..31bb65c 100644
--- a/src/FINNCppDriver/utils/CMakeLists.txt
+++ b/src/FINNCppDriver/utils/CMakeLists.txt
@@ -1,8 +1,7 @@
 file(GLOB_RECURSE UTILS_SRC *.cpp)
 
 add_library(finnc_utils SHARED ${UTILS_SRC})
-target_link_libraries(finnc_utils PUBLIC finnc_options ${Boost_LIBRARIES} nlohmann_json::nlohmann_json)
-target_link_directories(finnc_utils PRIVATE ${BOOST_LIBRARYDIR})
+target_link_libraries(finnc_utils PUBLIC finnc_options nlohmann_json::nlohmann_json)
 target_include_directories(finnc_utils PRIVATE ${FINN_SRC_DIR})
 set_target_properties(finnc_utils
     PROPERTIES
diff --git a/src/FINNCppDriver/utils/FinnUtils.h b/src/FINNCppDriver/utils/FinnUtils.h
index f548be6..963c68e 100644
--- a/src/FINNCppDriver/utils/FinnUtils.h
+++ b/src/FINNCppDriver/utils/FinnUtils.h
@@ -13,10 +13,11 @@
 #ifndef FINN_UTILS_H
 #define FINN_UTILS_H
 
-#include <FINNCppDriver/utils/Logger.h>
 #include <FINNCppDriver/utils/Types.h>
 
+#include <FINNCppDriver/utils/Logger.hpp>  // for FINN_LOG, loglevel, ...
 #include <algorithm>
+#include <array>
 #include <bit>
 #include <cmath>
 #include <concepts>
@@ -193,10 +194,8 @@ namespace FinnUtils {
 
     /**
      * @brief Put some newlines into the log script for clearer reading
-     *
-     * @param logger
      */
-    inline void logSpacer(logger_type& logger) { FINN_LOG(logger, loglevel::info) << "\n\n\n\n"; }
+    inline void logSpacer() { FINN_LOG(loglevel::info) << "\n\n\n\n"; }
 
     /**
      * @brief Calculates the number of elements in a tensor given its shape.
@@ -265,7 +264,7 @@ namespace FinnUtils {
      */
     template<typename E>
     [[noreturn]] void logAndError(const std::string& msg) {
-        FINN_LOG(Logger::getLogger(), loglevel::error) << msg;
+        FINN_LOG(loglevel::error) << msg;
         throw E(msg);
     }
 
diff --git a/src/FINNCppDriver/utils/Logger.cpp b/src/FINNCppDriver/utils/Logger.cpp
deleted file mode 100644
index 95f1a34..0000000
--- a/src/FINNCppDriver/utils/Logger.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * @file Logger.cpp
- * @author Linus Jungemann (linus.jungemann@uni-paderborn.de) and others
- * @brief Provides a easy to use logger for the FINN driver
- * @version 0.1
- * @date 2023-10-31
- *
- * @copyright Copyright (c) 2023
- * @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
- *
- */
-
-#include "Logger.h"
-
-#include <boost/core/enable_if.hpp>                       // for lazy_enable...
-#include <boost/exception/exception.hpp>                  // for exception
-#include <boost/log/core/core.hpp>                        // IWYU pragma: keep
-#include <boost/log/core/record.hpp>                      // IWYU pragma: keep
-#include <boost/log/detail/attachable_sstream_buf.hpp>    // IWYU pragma: keep
-#include <boost/log/detail/attr_output_impl.hpp>          // IWYU pragma: keep
-#include <boost/log/expressions/formatter.hpp>            // IWYU pragma: keep
-#include <boost/log/keywords/auto_flush.hpp>              // IWYU pragma: keep
-#include <boost/log/keywords/file_name.hpp>               // IWYU pragma: keep
-#include <boost/log/keywords/format.hpp>                  // IWYU pragma: keep
-#include <boost/log/keywords/rotation_size.hpp>           // IWYU pragma: keep
-#include <boost/log/keywords/severity.hpp>                // IWYU pragma: keep
-#include <boost/log/keywords/time_based_rotation.hpp>     // IWYU pragma: keep
-#include <boost/log/sinks/sync_frontend.hpp>              // IWYU pragma: keep
-#include <boost/log/sinks/text_file_backend.hpp>          // IWYU pragma: keep
-#include <boost/log/sources/record_ostream.hpp>           // IWYU pragma: keep
-#include <boost/log/utility/setup/common_attributes.hpp>  // IWYU pragma: keep
-#include <boost/log/utility/setup/console.hpp>            // IWYU pragma: keep
-#include <boost/log/utility/setup/formatter_parser.hpp>   // IWYU pragma: keep
-#include <boost/parameter/keyword.hpp>                    // for keyword
-#include <boost/smart_ptr/make_shared_object.hpp>         // for make_shared
-#include <boost/smart_ptr/shared_ptr.hpp>                 // for shared_ptr
-#include <boost/thread/exceptions.hpp>                    // for thread_inte...
-#include <iostream>                                       // for streamsize
-
-/**
- * @brief Abbrieviation for boost logging type
- *
- */
-using backend_type = bl::sinks::text_file_backend;
-/**
- * @brief Abbrieviation for boost logging type
- *
- */
-using sink_type = bl::sinks::synchronous_sink<backend_type>;
-namespace kw = bl::keywords;
-
-// NOLINTBEGIN
-#ifdef NDEBUG
-DevNull dev_null;
-#endif  // NDEBUG
-// NOLINTEND
-
-namespace Details {
-    /**
-     * @brief Global logger object. DO NOT ACCESS DIRECTLY!
-     *
-     */
-    // NOLINTNEXTLINE
-    logger_type boostLogger;
-}  // namespace Details
-
-// cppcheck-suppress unusedFunction
-logger_type& Logger::getLogger(bool console) {
-    static Logger log(console);
-    return Details::boostLogger;
-}
-
-Logger::Logger(bool console) {
-    auto backend = boost::make_shared<backend_type>(kw::file_name = "finnLog_%N.log", kw::rotation_size = 10 * 1024 * 1024, kw::time_based_rotation = bl::sinks::file::rotation_at_time_point(0, 0, 0), kw::auto_flush = true);
-
-    auto sink = boost::make_shared<sink_type>(backend);
-    sink->set_formatter(bl::parse_formatter(logFormat));
-
-    bl::core::get()->add_sink(sink);
-    initLogging(console);
-}
-
-void Logger::initLogging(bool console) {
-    static bool init = false;
-    if (!init) {
-        init = !init;
-        bl::register_simple_formatter_factory<bl::trivial::severity_level, char>("Severity");
-        boost::log::add_common_attributes();
-
-        if (console)
-            bl::add_console_log(std::clog, bl::keywords::format = logFormat);
-        return;
-    }
-    BOOST_LOG_SEV(Details::boostLogger, bl::trivial::warning) << "Do not init the logger more than once!";
-}
\ No newline at end of file
diff --git a/src/FINNCppDriver/utils/Logger.h b/src/FINNCppDriver/utils/Logger.hpp
similarity index 50%
rename from src/FINNCppDriver/utils/Logger.h
rename to src/FINNCppDriver/utils/Logger.hpp
index e2c43ca..0cd3ef4 100644
--- a/src/FINNCppDriver/utils/Logger.h
+++ b/src/FINNCppDriver/utils/Logger.hpp
@@ -2,10 +2,10 @@
  * @file Logger.h
  * @author Linus Jungemann (linus.jungemann@uni-paderborn.de) and others
  * @brief Provides a easy to use logger for the FINN driver
- * @version 0.1
+ * @version 0.2
  * @date 2023-10-31
  *
- * @copyright Copyright (c) 2023
+ * @copyright Copyright (c) 2023-2025
  * @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
  *
  */
@@ -13,40 +13,26 @@
 #ifndef LOGGING_H
 #define LOGGING_H
 
-/**
- * @brief Define boost logging to be linked dynamically
- *
- */
-// NOLINTNEXTLINE
-#define BOOST_LOG_DYN_LINK 1
+#include <plog/Appenders/ColorConsoleAppender.h>
+#include <plog/Appenders/RollingFileAppender.h>
+#include <plog/Formatters/TxtFormatter.h>
+#include <plog/Init.h>
+#include <plog/Log.h>
 
-// IWYU pragma: no_include <FINNCppDriver/utils/Logger.h>
-#include <boost/log/detail/config.hpp>                // IWYU pragma: keep
-#include <boost/log/sources/severity_feature.hpp>     // IWYU pragma: keep
-#include <boost/log/sources/severity_logger.hpp>      // IWYU pragma: keep
-#include <boost/log/trivial.hpp>                      // IWYU pragma: keep
-#include <boost/smart_ptr/intrusive_ptr.hpp>          // IWYU pragma: keep
-#include <boost/smart_ptr/intrusive_ref_counter.hpp>  // IWYU pragma: keep
-#include <string>                                     // for allocator, string
+#include <string>  // for allocator, string
 
-namespace bl = boost::log;
-namespace loglevel = bl::trivial;
+namespace loglevel = plog;
 
 /**
- * @brief Abrieviation of boost logging type
- *
- */
-using logger_type = bl::sources::severity_logger<bl::trivial::severity_level>;
-
-/**
- * @brief redefine Boost Logger for FINN
+ * @brief Redefine plog logging macros for FINN
  *
  */
 // NOLINTBEGIN
-#define FINN_LOG(LOGGER, SEV) BOOST_LOG_SEV(LOGGER, SEV)
+#define FINN_LOG(SEV) PLOG(SEV)
 #ifdef NDEBUG
-extern class [[maybe_unused]] DevNull {
-} dev_null;
+class [[maybe_unused]] DevNull {};
+
+static DevNull dev_null;
 
 template<typename T>
 DevNull& operator<<(DevNull& dest, [[maybe_unused]] T) {
@@ -56,28 +42,23 @@ DevNull& operator<<(DevNull& dest, [[maybe_unused]] T) {
      * @brief Defines debug logging macro that is removed when building in Release mode
      *
      */
-    #define FINN_LOG_DEBUG(LOGGER, SEV) dev_null
+    #define FINN_LOG_DEBUG(SEV) dev_null
 #else
     /**
      * @brief Defines debug logging macro that is removed when building in Release mode
      *
      */
-    #define FINN_LOG_DEBUG(LOGGER, SEV) FINN_LOG(LOGGER, SEV)
+    #define FINN_LOG_DEBUG(SEV) FINN_LOG(SEV)
 #endif  // NDEBUG
         // NOLINTEND
 
 /**
- * @brief Singleton class that provides logger functionality for the driver. Based on the boost severity logger
+ * @brief Singleton class that provides logger functionality for the driver.
  *
  */
 class Logger {
      public:
-    /**
-     * @brief Get the Logger object
-     *
-     * @return logger_type&
-     */
-    static logger_type& getLogger(bool console = false);
+    void static initLogger(bool console = false) { static Logger log(console); }
 
     /**
      * @brief Construct a new Logger object (Deleted)
@@ -108,8 +89,15 @@ class Logger {
     Logger(Logger&&) = default;
 
      private:
-    void initLogging(bool console = false);
-    Logger(bool console = false);
+    Logger(bool console = false) {
+        static plog::RollingFileAppender<plog::TxtFormatter> fileAppender("finnLog.log", 10 * 1024 * 1024, 3);
+        static plog::ColorConsoleAppender<plog::TxtFormatter> consoleAppender;
+        if (console) {
+            plog::init(plog::debug, &fileAppender).addAppender(&consoleAppender);
+        } else {
+            plog::init(plog::debug, &fileAppender);
+        }
+    }
     const std::string logFormat = "[%TimeStamp%] (%LineID%) [%Severity%]: %Message%";
 };
 
diff --git a/src/FINNCppDriver/utils/RingBuffer.hpp b/src/FINNCppDriver/utils/RingBuffer.hpp
deleted file mode 100644
index 373c4b6..0000000
--- a/src/FINNCppDriver/utils/RingBuffer.hpp
+++ /dev/null
@@ -1,375 +0,0 @@
-/**
- * @file RingBuffer.hpp
- * @author Bjarne Wintermann (bjarne.wintermann@uni-paderborn.de), Linus Jungemann (linus.jungemann@uni-paderborn.de) and others
- * @brief Implements a wrapper for the circular buffer of boost
- * @version 2.0
- * @date 2023-11-14
- *
- * @copyright Copyright (c) 2023
- * @license All rights reserved. This program and the accompanying materials are
- * made available under the terms of the MIT license.
- *
- */
-
-#ifndef RINGBUFFER
-#define RINGBUFFER
-
-#include <FINNCppDriver/utils/FinnUtils.h>
-#include <FINNCppDriver/utils/Logger.h>
-
-#include <algorithm>
-#include <atomic>
-#include <boost/circular_buffer.hpp>
-#include <condition_variable>
-#include <iterator>
-#include <mutex>
-#include <numeric>
-#include <span>
-#include <syncstream>
-#include <thread>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-
-namespace Finn {
-    /**
-     * @brief Wrapper class for boost::circular_buffer, which handles abstraction.
-     *
-     * @tparam T
-     */
-    template<typename T, bool multiThreaded = false>
-    class RingBuffer {
-        boost::circular_buffer<T> buffer;
-
-        std::mutex readWriteMutex;
-        std::condition_variable cv;
-
-        std::size_t elementsPerPart;
-
-        /**
-         * @brief A small prefix to determine the source of the log write
-         *
-         * @return std::string
-         */
-        std::string static loggerPrefix() { return "[RingBuffer] "; }
-
-        std::size_t freeSpaceNotLocked() const { return buffer.capacity() - buffer.size(); }
-
-         public:
-        /**
-         * @brief Construct a new Ring Buffer object. It's size in terms of values of
-         * type T is given by pElementsPerPart * pParts. By default all parts are
-         * invalid data to start with.
-         *
-         * @param pParts
-         * @param pElementsPerPart
-         */
-        RingBuffer(const size_t pParts, const size_t pElementsPerPart) : buffer(pElementsPerPart * pParts), elementsPerPart(pElementsPerPart) {
-            auto logger = Logger::getLogger();
-            FINN_LOG(logger, loglevel::info) << "Ringbuffer initialised with " << pElementsPerPart << " Elements per Part and " << pParts << " Parts.\n";
-            if (pElementsPerPart * pParts == 0) {
-                FinnUtils::logAndError<std::runtime_error>("It is not possible to create a buffer of size 0!");
-            }
-        }
-
-        /**
-         * @brief Construct a new Ring Buffer object (Move constructor)
-         *
-         * @param other
-         */
-        RingBuffer(RingBuffer&& other) noexcept : buffer(std::move(other.buffer)), elementsPerPart(other.elementsPerPart) {}
-
-        RingBuffer(const RingBuffer& other) = delete;
-        virtual ~RingBuffer() = default;
-        RingBuffer& operator=(RingBuffer&& other) = delete;
-        RingBuffer& operator=(const RingBuffer& other) = delete;
-
-        /**
-         * @brief Tests if ring buffer is empty
-         *
-         * @return true success
-         * @return false failure
-         */
-        bool empty() {
-            if constexpr (multiThreaded) {
-                std::lock_guard guard(readWriteMutex);
-                return buffer.empty();
-            } else {
-                return buffer.empty();
-            }
-        }
-
-        /**
-         * @brief Tests if ring buffer is full
-         *
-         * @return true success
-         * @return false failure
-         */
-        bool full() {
-            if constexpr (multiThreaded) {
-                std::lock_guard guard(readWriteMutex);
-                return buffer.full();
-            } else {
-                // std::cout << "" << "\n";
-                return buffer.full();
-            }
-        }
-
-        /**
-         * @brief Get the availble free space in the driver
-         *
-         * @return std::size_t
-         */
-        std::size_t freeSpace() {
-            if constexpr (multiThreaded) {
-                std::lock_guard guard(readWriteMutex);
-                return buffer.capacity() - buffer.size();
-            } else {
-                return buffer.capacity() - buffer.size();
-            }
-        }
-
-        /**
-         *
-         * @brief Return the RingBuffer's size, either in elements of T, in bytes or in parts
-         *
-         * @param ss
-         * @return size_t
-         */
-        size_t size(SIZE_SPECIFIER ss) const {
-            if (ss == SIZE_SPECIFIER::TOTAL_DATA_SIZE) {
-                return buffer.capacity();
-            } else if (ss == SIZE_SPECIFIER::BYTES) {
-                return buffer.capacity() * sizeof(T);
-            } else if (ss == SIZE_SPECIFIER::BATCHSIZE) {
-                return buffer.capacity() / elementsPerPart;
-            } else if (ss == SIZE_SPECIFIER::FEATUREMAP_SIZE) {
-                return elementsPerPart;
-            } else {
-                FinnUtils::logAndError<std::runtime_error>("Unknown size specifier!");
-                return 0;
-            }
-        }
-
-        /**
-         * @brief Get the number of batch elements that can be stored in the buffer
-         *
-         * @return size_t
-         */
-        size_t size() {
-            if constexpr (multiThreaded) {
-                std::lock_guard guard(readWriteMutex);
-                return buffer.size() / elementsPerPart;
-            } else {
-                return buffer.size() / elementsPerPart;
-            }
-        }
-
-        /**
-         * @brief Stores data in the ring buffer. In singlethreaded mode, it returns
-         * false if data could not be stored and true otherwise. In multithreaded
-         * mode, the method will block until data can be stored.
-         *
-         * @tparam IteratorType
-         * @param first
-         * @param last
-         * @return true
-         * @return false
-         */
-        template<typename IteratorType>
-        bool store(IteratorType first, IteratorType last) {
-            const std::size_t datasize = std::abs(std::distance(first, last));
-            if (datasize % elementsPerPart != 0) {
-                FinnUtils::logAndError<std::runtime_error>("It is not possible to store data that is not a multiple of a part! Datasize: " + std::to_string(datasize) + ", Elements per Part: " + std::to_string(elementsPerPart) + "\n");
-            }
-            if (datasize > buffer.capacity()) {
-                FinnUtils::logAndError<std::runtime_error>("It is not possible to store more data in the buffer, than capacity available!");
-            }
-            if constexpr (multiThreaded) {
-                // lock buffer
-                std::unique_lock lk(readWriteMutex);
-                if (datasize > freeSpaceNotLocked()) {
-                    // go to sleep and wait until enough space available
-                    cv.wait(lk, [&datasize, this] { return datasize <= freeSpaceNotLocked(); });
-                }
-                // put data into buffer
-                buffer.insert(buffer.end(), first, last);
-
-                // Manual unlocking is done before notifying, to avoid waking up
-                // the waiting thread only to block again
-                lk.unlock();
-                cv.notify_one();
-                return true;
-
-            } else {
-                if (datasize > freeSpaceNotLocked()) {
-                    // Data could not be stored
-                    return false;
-                }
-                // put data into buffer
-                buffer.insert(buffer.end(), first, last);
-                return true;
-            }
-        }
-
-        /**
-         * @brief Store input data in the buffer
-         *
-         * @tparam IteratorType
-         * @param data
-         * @param datasize
-         * @return true
-         * @return false
-         */
-        template<typename IteratorType>
-        bool store(const IteratorType data, size_t datasize) {
-            return store(data, data + datasize);
-        }
-
-        /**
-         * @brief Store input data in the buffer
-         *
-         * @param vec
-         * @return true
-         * @return false
-         */
-        bool store(const std::vector<T> vec) { return store(vec.begin(), vec.end()); }
-
-        /**
-         * @brief Read the ring buffer and write out the first valid entry into the
-         * provided storage container. If no valid part is found, false is returned in
-         * the singlethreaded case. While multithreading, the method blocks instead.
-         *
-         * @tparam IteratorType
-         * @param outputIt
-         * @param stoken Needed for threaded operation. Do not set by hand!
-         * @return true
-         * @return false
-         */
-        template<typename IteratorType>
-        bool read(IteratorType outputIt, std::stop_token stoken = {}) {
-            if constexpr (multiThreaded) {
-                // lock buffer
-                std::unique_lock lk(readWriteMutex);
-
-                if (buffer.size() < elementsPerPart) {
-                    // Not enough data so block
-                    // go to sleep and wait until enough data available
-                    using namespace std::literals::chrono_literals;
-                    while (!cv.wait_for(lk, 2000ms, [this] { return buffer.size() >= elementsPerPart; })) {
-                        if (stoken.stop_requested()) {
-                            return false;
-                        }
-                    }
-                }
-
-                // read data
-                auto begin = buffer.begin();
-                std::copy(begin, begin + elementsPerPart, outputIt);
-                buffer.erase(begin, begin + elementsPerPart);
-
-                // Manual unlocking is done before notifying, to avoid waking up
-                // the waiting thread only to block again
-                lk.unlock();
-                cv.notify_one();
-                return true;
-
-            } else {
-                if (buffer.size() < elementsPerPart) {
-                    // Not enough data so fail
-                    return false;
-                }
-
-                auto begin = buffer.begin();
-                std::copy(begin, begin + elementsPerPart, outputIt);
-                buffer.erase(begin, begin + elementsPerPart);
-                return true;
-            }
-        }
-
-        /**
-         * @brief Read the ring buffer and write out the valid entries into the
-         * provided storage container. Read data is invalidated. If no valid part is found, false is returned
-         *
-         * @tparam IteratorType
-         * @param outputIt
-         * @return true
-         * @return false
-         */
-        template<typename IteratorType>
-        bool readAllValidParts(IteratorType outputIt) {
-            if constexpr (multiThreaded) {
-                std::unique_lock lk(readWriteMutex);
-                if (buffer.empty()) {
-                    return false;
-                }
-
-                std::copy(buffer.begin(), buffer.end(), outputIt);
-                buffer.clear();
-
-                // Manual unlocking is done before notifying, to avoid waking up
-                // the waiting thread only to block again
-                lk.unlock();
-                cv.notify_one();
-                return true;
-            } else {
-                if (buffer.empty()) {
-                    return false;
-                }
-
-                std::copy(buffer.begin(), buffer.end(), outputIt);
-                buffer.clear();
-
-                return true;
-            }
-        }
-
-        /**
-         * @brief Read the ring buffer and write out the valid entries into the
-         * provided storage container. If no valid part is found, false is returned
-         *
-         * @tparam IteratorType
-         * @param outputIt
-         * @param index
-         * @return true
-         * @return false
-         */
-        template<typename IteratorType>
-        bool readWithoutInvalidation(IteratorType outputIt, int index = -1) {
-            if constexpr (multiThreaded) {
-                std::unique_lock lk(readWriteMutex);
-                if (buffer.empty()) {
-                    return false;
-                }
-
-                if (index == -1) {
-                    std::copy(buffer.begin(), buffer.end(), outputIt);
-                } else {
-                    std::copy(buffer.begin() + elementsPerPart * index, buffer.begin() + elementsPerPart * (index + 1), outputIt);
-                }
-
-
-                // Manual unlocking is done before notifying, to avoid waking up
-                // the waiting thread only to block again
-                lk.unlock();
-                cv.notify_one();
-                return true;
-            } else {
-                if (buffer.empty()) {
-                    return false;
-                }
-
-                if (index == -1) {
-                    std::copy(buffer.begin(), buffer.end(), outputIt);
-                } else {
-                    std::copy(buffer.begin() + elementsPerPart * index, buffer.begin() + elementsPerPart * (index + 1), outputIt);
-                }
-
-                return true;
-            }
-        }
-    };
-}  // namespace Finn
-
-
-#endif  // RINGBUFFER
diff --git a/src/FINNCppDriver/utils/SPSCQueue.hpp b/src/FINNCppDriver/utils/SPSCQueue.hpp
new file mode 100644
index 0000000..ee5750f
--- /dev/null
+++ b/src/FINNCppDriver/utils/SPSCQueue.hpp
@@ -0,0 +1,1722 @@
+/**
+ * @file SPSCQueue.hpp
+ * @author Linus Jungemann (linus.jungemann@uni-paderborn.de) and others
+ * @brief Single-Producer, Single-Consumer lock-free queue implementation
+ * @version 1.0
+ * @date 2025-06-23
+ *
+ * @copyright Copyright (c) 2025
+ * @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
+ *
+ * This file provides a highly optimized SPSC queue implementation with
+ * support for blocking and non-blocking operations, bulk transfers,
+ * and various CPU-specific optimizations.
+ */
+
+#ifndef SPSC_QUEUE_HPP
+#define SPSC_QUEUE_HPP
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <bit>
+#include <concepts>
+#include <condition_variable>
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <new>
+#include <stop_token>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+using namespace std::literals::chrono_literals;
+
+// For CPU-specific optimizations
+#if defined(__x86_64__) || defined(_M_X64)
+    #include <immintrin.h>
+#elif defined(__aarch64__)
+    #include <arm_neon.h>
+#endif
+
+/**
+ * @brief Namespace containing implementation details for the SPSCQueue
+ *
+ * This namespace contains various helper utilities, traits, and optimized
+ * functions that support the main SPSCQueue implementation.
+ */
+namespace detail {
+    /**
+     * @brief Enumeration of supported SIMD instruction sets
+     *
+     * Used to detect and select the appropriate SIMD implementation
+     * for memory operations based on the target platform.
+     */
+    enum class SIMDSupport {
+        None,    ///< No SIMD support
+        SSE2,    ///< x86_64 baseline SIMD (128-bit)
+        AVX,     ///< 256-bit SIMD instructions
+        AVX2,    ///< Enhanced AVX instructions
+        AVX512,  ///< 512-bit SIMD instructions
+        NEON     ///< ARM SIMD instructions
+    };
+
+    /**
+     * @brief Detects the available SIMD support for the current platform
+     *
+     * @return The highest level of SIMD support available on the current platform
+     */
+    inline SIMDSupport detect_simd_support() {
+#if defined(__x86_64__) || defined(_M_X64)
+    #if defined(__AVX512F__)
+        return SIMDSupport::AVX512;
+    #elif defined(__AVX2__)
+        return SIMDSupport::AVX2;
+    #elif defined(__AVX__)
+        return SIMDSupport::AVX;
+    #elif defined(__SSE2__)
+        return SIMDSupport::SSE2;
+    #else
+        return SIMDSupport::None;
+    #endif
+#elif defined(__aarch64__)
+        return SIMDSupport::NEON;
+#else
+        return SIMDSupport::None;
+#endif
+    }
+
+    /**
+     * @brief SIMD-optimized memory copy function
+     *
+     * Uses the appropriate SIMD instructions based on the target platform
+     * to efficiently copy data between memory locations.
+     *
+     * @tparam T Type of elements to copy
+     * @param dst Destination pointer (must not overlap with source)
+     * @param src Source pointer
+     * @param count Number of elements to copy
+     */
+    template<typename T>
+    inline void simd_memcpy(T* __restrict dst, const T* __restrict src, size_t count) {
+        static constexpr bool is_suitable_for_simd = std::is_trivially_copyable_v<T> && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+
+        // Convert to byte pointers for SIMD operations
+        char* d = reinterpret_cast<char*>(dst);
+        const char* s = reinterpret_cast<const char*>(src);
+        const size_t bytes = count * sizeof(T);
+
+        // For small copies or non-SIMD-friendly types, use memcpy directly
+        if (!is_suitable_for_simd || bytes < 128) {
+            std::memcpy(d, s, bytes);
+            return;
+        }
+
+        static const SIMDSupport simd_level = detect_simd_support();
+
+#if defined(__x86_64__) || defined(_M_X64)
+        // Check if both pointers are aligned for SIMD
+        const bool is_aligned = (reinterpret_cast<uintptr_t>(d) % 32 == 0) && (reinterpret_cast<uintptr_t>(s) % 32 == 0);
+
+    #if defined(__AVX512F__)
+        if (simd_level >= SIMDSupport::AVX512 && bytes >= 64) {
+            // AVX-512 implementation (64-byte blocks)
+            size_t i = 0;
+
+            // Handle 64-byte blocks with AVX-512
+            for (; i + 64 <= bytes; i += 64) {
+                __m512i data = is_aligned ? _mm512_load_si512(reinterpret_cast<const __m512i*>(s + i)) : _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + i));
+
+                if (is_aligned) {
+                    _mm512_store_si512(reinterpret_cast<__m512i*>(d + i), data);
+                } else {
+                    _mm512_storeu_si512(reinterpret_cast<__m512i*>(d + i), data);
+                }
+            }
+
+            // Handle remainder with standard memcpy
+            if (i < bytes) {
+                std::memcpy(d + i, s + i, bytes - i);
+            }
+            return;
+        }
+    #endif
+
+    #if defined(__AVX2__) || defined(__AVX__)
+        if (simd_level >= SIMDSupport::AVX && bytes >= 32) {
+            // AVX/AVX2 implementation (32-byte blocks)
+            size_t i = 0;
+
+            // Handle 32-byte blocks with AVX
+            for (; i + 32 <= bytes; i += 32) {
+                __m256i data = is_aligned ? _mm256_load_si256(reinterpret_cast<const __m256i*>(s + i)) : _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
+
+                if (is_aligned) {
+                    _mm256_store_si256(reinterpret_cast<__m256i*>(d + i), data);
+                } else {
+                    _mm256_storeu_si256(reinterpret_cast<__m256i*>(d + i), data);
+                }
+            }
+
+            // Handle remainder with standard memcpy
+            if (i < bytes) {
+                std::memcpy(d + i, s + i, bytes - i);
+            }
+            return;
+        }
+    #endif
+
+    #if defined(__SSE2__)
+        if (simd_level >= SIMDSupport::SSE2 && bytes >= 16) {
+            // SSE2 implementation (16-byte blocks)
+            size_t i = 0;
+
+            // Handle 16-byte blocks with SSE2
+            for (; i + 16 <= bytes; i += 16) {
+                __m128i data = is_aligned ? _mm_load_si128(reinterpret_cast<const __m128i*>(s + i)) : _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
+
+                if (is_aligned) {
+                    _mm_store_si128(reinterpret_cast<__m128i*>(d + i), data);
+                } else {
+                    _mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), data);
+                }
+            }
+
+            // Handle remainder with standard memcpy
+            if (i < bytes) {
+                std::memcpy(d + i, s + i, bytes - i);
+            }
+            return;
+        }
+    #endif
+
+#elif defined(__aarch64__)
+        if (simd_level == SIMDSupport::NEON && bytes >= 16) {
+            // NEON implementation (16-byte blocks)
+            size_t i = 0;
+
+            // Handle 16-byte blocks with NEON
+            for (; i + 16 <= bytes; i += 16) {
+                uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
+                vst1q_u8(reinterpret_cast<uint8_t*>(d + i), data);
+            }
+
+            // Handle remainder with standard memcpy
+            if (i < bytes) {
+                std::memcpy(d + i, s + i, bytes - i);
+            }
+            return;
+        }
+#endif
+
+        // Fallback to standard memcpy
+        std::memcpy(d, s, bytes);
+    }
+
+    /**
+     * @brief Type trait to detect smart pointer types
+     *
+     * @tparam T Type to check
+     * @tparam Void SFINAE helper
+     */
+    template<typename T, typename = void>
+    struct is_smart_pointer : std::false_type {};
+
+    /**
+     * @brief Specialization for detecting smart pointer types with common operations
+     *
+     * Detects types that have operator*, operator->, and get() methods
+     * which are common for smart pointer implementations.
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    struct is_smart_pointer<T, std::void_t<decltype(std::declval<T>().operator*()), decltype(std::declval<T>().operator->()), decltype(std::declval<T>().get())>> : std::true_type {};
+
+    /**
+     * @brief Specialization for std::weak_ptr
+     *
+     * @tparam T Contained type
+     */
+    template<typename T>
+    struct is_smart_pointer<std::weak_ptr<T>> : std::true_type {};
+
+    /**
+     * @brief Helper variable template for is_smart_pointer
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    inline constexpr bool is_smart_pointer_v = is_smart_pointer<T>::value;
+
+    /**
+     * @brief Type trait to detect container-like types
+     *
+     * @tparam T Type to check
+     * @tparam Void SFINAE helper
+     */
+    template<typename T, typename = void>
+    struct is_container_like : std::false_type {};
+
+    /**
+     * @brief Specialization for detecting container-like types
+     *
+     * Detects types that have begin(), end(), and size() methods
+     * which are common for container implementations.
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    struct is_container_like<T, std::void_t<decltype(std::declval<T>().begin()), decltype(std::declval<T>().end()), decltype(std::declval<T>().size())>> : std::true_type {};
+
+    /**
+     * @brief Helper variable template for is_container_like
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    inline constexpr bool is_container_like_v = is_container_like<T>::value;
+
+    /**
+     * @brief Checks if a type has custom resource management
+     *
+     * Detects types that have custom destructors and move operations,
+     * which often indicate resource management.
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    inline constexpr bool has_custom_resource_management_v = !std::is_trivially_destructible_v<T> && (!std::is_trivially_move_constructible_v<T> || !std::is_trivially_move_assignable_v<T>);
+
+    /**
+     * @brief Type trait to detect types with problematic move semantics
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    struct has_problematic_move_semantics {
+        /**
+         * @brief Explicit list of known problematic types
+         *
+         * These types are known to have issues with move-then-destroy patterns
+         */
+        static constexpr bool explicit_list = std::is_same_v<T, std::string> || std::is_same_v<T, std::vector<bool>> ||  // vector<bool> is special
+                                              false;                                                                     // Extensible for other specific cases
+
+        /**
+         * @brief Heuristic detection for potentially problematic types
+         *
+         * Uses type traits to identify types that might have issues
+         * when moved from and then destroyed
+         */
+        static constexpr bool heuristic_detection = has_custom_resource_management_v<T> && is_container_like_v<T> && !std::is_trivially_copyable_v<T>;
+
+        /**
+         * @brief Combined detection result
+         */
+        static constexpr bool value = explicit_list || heuristic_detection;
+    };
+
+    /**
+     * @brief Helper variable template for has_problematic_move_semantics
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    inline constexpr bool has_problematic_move_semantics_v = has_problematic_move_semantics<T>::value;
+
+    /**
+     * @brief Type trait to detect types that are unsafe to destroy after moving from
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    struct unsafe_to_destroy_after_move : std::bool_constant<std::is_pointer_v<T> || is_smart_pointer_v<T> || has_problematic_move_semantics_v<T>> {};
+
+    /**
+     * @brief Helper variable template for unsafe_to_destroy_after_move
+     *
+     * @tparam T Type to check
+     */
+    template<typename T>
+    inline constexpr bool unsafe_to_destroy_after_move_v = unsafe_to_destroy_after_move<T>::value;
+
+    /**
+     * @brief Prefetches memory for read access
+     *
+     * Provides a hint to the CPU to prefetch memory into cache
+     * for upcoming read operations.
+     *
+     * @param ptr Pointer to memory to prefetch
+     * @param locality Temporal locality hint (0-3, where 3 means high locality)
+     */
+    static inline void prefetch_read(const void* ptr, int locality = 3) noexcept {
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_prefetch(ptr, 0, locality);  // Read with configurable locality
+#endif
+    }
+
+    /**
+     * @brief Prefetches memory for write access
+     *
+     * Provides a hint to the CPU to prefetch memory into cache
+     * for upcoming write operations.
+     *
+     * @param ptr Pointer to memory to prefetch
+     * @param locality Temporal locality hint (0-3, where 3 means high locality)
+     */
+    static inline void prefetch_write(const void* ptr, int locality = 3) noexcept {
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_prefetch(ptr, 1, locality);  // Write with configurable locality
+#endif
+    }
+
+    /**
+     * @brief Executes a CPU pause instruction
+     *
+     * Used in spin-wait loops to reduce power consumption and
+     * improve performance on hyper-threaded processors.
+     */
+    static inline void cpu_pause() noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+        _mm_pause();
+#elif defined(__aarch64__)
+        asm volatile("yield" ::: "memory");
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+        asm volatile("or 27,27,27" ::: "memory");
+#else
+        std::this_thread::yield();  // Fallback to standard yield
+#endif
+    }
+
+    /**
+     * @brief Implements an exponential backoff strategy for spin-waiting
+     *
+     * Gradually increases the delay between retries to reduce contention
+     * and power consumption during spin-waiting.
+     */
+    class exponential_backoff {
+         private:
+        int current_delay = 1;  ///< Current delay count
+        const int max_delay;    ///< Maximum delay limit
+
+         public:
+        /**
+         * @brief Constructs an exponential backoff object
+         *
+         * @param max Maximum delay value (default: 1024)
+         */
+        explicit exponential_backoff(int max = 1024) : max_delay(max) {}
+
+        /**
+         * @brief Executes the backoff delay and increases the delay for next time
+         */
+        void operator()() noexcept {
+            for (int i = 0; i < current_delay; ++i) {
+                cpu_pause();
+            }
+
+            // Exponentially increase delay, up to max_delay
+            current_delay = std::min(current_delay * 2, max_delay);
+        }
+
+        /**
+         * @brief Resets the delay back to initial value
+         */
+        void reset() noexcept { current_delay = 1; }
+    };
+
+    /**
+     * @brief Base class for SPSCQueue implementations
+     *
+     * Contains common implementation details shared by all specializations
+     * of the SPSCQueue.
+     *
+     * @tparam T Element type
+     * @tparam ActualCapacity Actual capacity of the queue (power of 2)
+     * @tparam IsTrivial Whether T is a trivially copyable type
+     */
+    template<typename T, size_t ActualCapacity, bool IsTrivial>
+    class SPSCQueueBase {
+         protected:
+        static constexpr size_t CACHE_LINE_SIZE = 64;             ///< Size of a cache line in bytes
+        static constexpr size_t INDEX_MASK = ActualCapacity - 1;  ///< Mask for index wrapping
+
+        /**
+         * @brief Cache-aligned atomic size_t with padding to prevent false sharing
+         */
+        struct alignas(CACHE_LINE_SIZE) AlignedAtomicSize {
+            std::atomic<size_t> value{0};  ///< The atomic value
+            /// Padding to fill a complete cache line
+            char padding[CACHE_LINE_SIZE - sizeof(std::atomic<size_t>)];
+
+            /**
+             * @brief Loads the current value
+             *
+             * @param order Memory order for the operation
+             * @return Current value
+             */
+            size_t load(std::memory_order order = std::memory_order_seq_cst) const noexcept { return value.load(order); }
+
+            /**
+             * @brief Stores a new value
+             *
+             * @param desired Value to store
+             * @param order Memory order for the operation
+             */
+            void store(size_t desired, std::memory_order order = std::memory_order_seq_cst) noexcept { value.store(desired, order); }
+        };
+
+        // Cache-aligned elements to prevent false sharing
+        alignas(CACHE_LINE_SIZE) std::array<T, ActualCapacity> buffer_;  ///< Element storage buffer
+
+        alignas(CACHE_LINE_SIZE) AlignedAtomicSize head_;  ///< Consumer position
+        char head_padding_[CACHE_LINE_SIZE];               ///< Extra padding between head and tail
+
+        alignas(CACHE_LINE_SIZE) AlignedAtomicSize tail_;  ///< Producer position
+        char tail_padding_[CACHE_LINE_SIZE];               ///< Extra padding after tail
+
+        /**
+         * @brief State for blocking operations
+         */
+        alignas(CACHE_LINE_SIZE) struct BlockingState {
+            mutable std::mutex mutex_;           ///< Mutex for blocking operations
+            std::condition_variable not_full_;   ///< CV for space available notifications
+            std::condition_variable not_empty_;  ///< CV for item available notifications
+            std::atomic<bool> is_active_{true};  ///< Whether the queue is active
+            char padding[CACHE_LINE_SIZE];       ///< Padding to fill a complete cache line
+        } blocking_;
+
+        static constexpr int SPIN_ATTEMPTS = 1000;  ///< Number of spin attempts before blocking
+        static constexpr int YIELD_ATTEMPTS = 50;   ///< Number of yield attempts during spinning
+
+        /**
+         * @brief Calculates the number of items available for consumption
+         *
+         * @return Number of items available
+         */
+        size_t available_items() const noexcept {
+            const size_t head = head_.load(std::memory_order_relaxed);
+            const size_t tail = tail_.load(std::memory_order_acquire);
+            return (tail - head) & INDEX_MASK;
+        }
+
+        /**
+         * @brief Calculates the available space for production
+         *
+         * @return Number of free slots available
+         */
+        size_t available_space() const noexcept {
+            const size_t head = head_.load(std::memory_order_acquire);
+            const size_t tail = tail_.load(std::memory_order_relaxed);
+            return ((head - tail - 1) & INDEX_MASK);
+        }
+    };
+}  // namespace detail
+
+/**
+ * @brief Single-Producer Single-Consumer lock-free queue
+ *
+ * A high-performance queue designed for the single-producer,
+ * single-consumer scenario. Features include:
+ * - Lock-free operations for high throughput
+ * - Blocking and non-blocking interfaces
+ * - Bulk transfer operations
+ * - SIMD-optimized memory operations
+ * - Cache-friendly design to minimize false sharing
+ * - Support for stop tokens for cancellation
+ *
+ * @tparam T Element type (must be movable)
+ * @tparam RequestedCapacity Desired minimum capacity
+ */
+template<typename T, size_t RequestedCapacity>
+    requires std::movable<T>
+class SPSCQueue : private detail::SPSCQueueBase<T, std::bit_ceil(RequestedCapacity), std::is_trivially_copyable_v<T>> {
+     private:
+    // Import base members into this scope
+    using Base = detail::SPSCQueueBase<T, std::bit_ceil(RequestedCapacity), std::is_trivially_copyable_v<T>>;
+    using Base::blocking_;
+    using Base::buffer_;
+    using Base::head_;
+    using Base::INDEX_MASK;
+    using Base::SPIN_ATTEMPTS;
+    using Base::tail_;
+    using Base::YIELD_ATTEMPTS;
+
+    /// Actual capacity rounded up to the next power of 2
+    static constexpr size_t ActualCapacity = std::bit_ceil(RequestedCapacity);
+
+     public:
+    /**
+     * @brief Constructs an empty queue
+     *
+     * Initializes an empty queue with the specified capacity.
+     * The actual capacity will be rounded up to the next power of 2,
+     * with one slot reserved for implementation purposes.
+     */
+    constexpr SPSCQueue() noexcept { static_assert(ActualCapacity >= 2, "Queue capacity must be at least 2"); }
+
+    /**
+     * @brief Destructor
+     *
+     * Wakes up any waiting threads and properly destroys
+     * any remaining elements in the queue.
+     */
+    ~SPSCQueue() {
+        // Wake up any waiting threads and destroy remaining elements
+        blocking_.is_active_.store(false, std::memory_order_release);
+        blocking_.not_empty_.notify_all();  // Notify all instead of just one
+        blocking_.not_full_.notify_all();
+
+        // Clean up any remaining elements if not trivially destructible
+        if constexpr (!std::is_trivially_destructible_v<T>) {
+            size_t head = head_.load(std::memory_order_relaxed);
+            size_t tail = tail_.load(std::memory_order_relaxed);
+
+            while (head != tail) {
+                buffer_[head].~T();
+                head = (head + 1) & INDEX_MASK;
+            }
+        }
+    }
+
+    //////////ENQUEUE OPERATIONS//////////
+
+    /**
+     * @brief Attempts to enqueue an element (copy version)
+     *
+     * Non-blocking operation that attempts to add an item to the queue.
+     *
+     * @param item Element to enqueue
+     * @return true if successful, false if the queue was full
+     */
+    bool try_enqueue(const T& item) noexcept(std::is_nothrow_copy_constructible_v<T>) {
+        const size_t current_tail = tail_.load(std::memory_order_relaxed);
+        const size_t next_tail = (current_tail + 1) & INDEX_MASK;
+
+        // Relaxed load followed by acquire if needed (optimization)
+        if (next_tail == head_.load(std::memory_order_relaxed)) {
+            if (next_tail == head_.load(std::memory_order_acquire))
+                return false;
+        }
+
+        // Prefetch with locality hint for next operation
+        detail::prefetch_write(&buffer_[current_tail], 3);
+
+        // For trivially copyable small types, direct assignment is faster than placement new
+        if constexpr (std::is_trivially_copyable_v<T> && sizeof(T) <= 16) {
+            buffer_[current_tail] = item;
+        } else {
+            new (&buffer_[current_tail]) T(item);
+        }
+
+        // Release memory ordering ensures visibility to consumer
+        tail_.store(next_tail, std::memory_order_release);
+
+        // Only notify if queue was empty (reduces contention)
+        if (current_tail == head_.load(std::memory_order_relaxed))
+            blocking_.not_empty_.notify_one();
+
+        return true;
+    }
+
+    /**
+     * @brief Attempts to enqueue an element (move version)
+     *
+     * Non-blocking operation that attempts to add an item to the queue
+     * using move semantics for better performance.
+     *
+     * @param item Element to enqueue
+     * @return true if successful, false if the queue was full
+     */
+    bool try_enqueue(T&& item) noexcept(std::is_nothrow_move_constructible_v<T>) {
+        const size_t current_tail = tail_.load(std::memory_order_relaxed);
+        const size_t next_tail = (current_tail + 1) & INDEX_MASK;
+
+        // Optimization: Relaxed load first, then acquire if needed
+        if (next_tail == head_.load(std::memory_order_relaxed)) {
+            // Double-check with acquire semantics
+            if (next_tail == head_.load(std::memory_order_acquire))
+                return false;  // Queue is full
+        }
+
+        // Optimization: Prefetch for write to reduce cache misses
+        detail::prefetch_write(&buffer_[current_tail]);
+
+        new (&buffer_[current_tail]) T(std::move(item));
+        tail_.store(next_tail, std::memory_order_release);
+
+        // Notify consumer if queue was empty
+        if (current_tail == head_.load(std::memory_order_relaxed))
+            blocking_.not_empty_.notify_one();
+
+        return true;
+    }
+
+    /**
+     * @brief Enqueues an element, blocking if necessary (copy version)
+     *
+     * Blocks the calling thread until space is available in the queue.
+     *
+     * @param item Element to enqueue
+     */
+    void enqueue(const T& item) {
+        // Try fast path first
+        if (try_enqueue(item))
+            return;
+
+        // Slow path with blocking
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+        blocking_.not_full_.wait(lock, [this, &item] { return try_enqueue(item) || !blocking_.is_active_.load(std::memory_order_acquire); });
+    }
+
+    /**
+     * @brief Enqueues an element, blocking if necessary (move version)
+     *
+     * Blocks the calling thread until space is available in the queue.
+     * Uses move semantics for better performance.
+     *
+     * @param item Element to enqueue
+     */
+    void enqueue(T&& item) {
+        // Try fast path first
+        if (try_enqueue(std::move(item)))
+            return;
+
+        // Slow path with blocking
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+        blocking_.not_full_.wait(lock, [this, &item] { return try_enqueue(std::move(item)) || !blocking_.is_active_.load(std::memory_order_acquire); });
+    }
+
+    /**
+     * @brief Enqueues an element with cancellation support (copy version)
+     *
+     * Blocks until space is available or the operation is cancelled.
+     *
+     * @tparam StopToken Type meeting the StopToken concept
+     * @param item Element to enqueue
+     * @param stop_token Token that can be used to cancel the operation
+     * @return true if the element was enqueued, false if cancelled
+     */
+    template<typename StopToken>
+    bool enqueue(const T& item, StopToken&& stop_token) {
+        // Try fast path first
+        if (try_enqueue(item))
+            return true;
+
+        // Slow path with blocking and cancellation support
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        // Wait until space available, queue inactive, or stop requested
+        std::condition_variable_any{}.wait(lock, stop_token, [this, &item] { return try_enqueue(item) || !blocking_.is_active_.load(std::memory_order_acquire); });
+
+        // Check if enqueue succeeded or stopped
+        return !stop_token.stop_requested() && blocking_.is_active_.load(std::memory_order_acquire);
+    }
+
+    /**
+     * @brief Enqueues an element with cancellation support (move version)
+     *
+     * Blocks until space is available or the operation is cancelled.
+     * Uses move semantics for better performance.
+     *
+     * @tparam StopToken Type meeting the StopToken concept
+     * @param item Element to enqueue
+     * @param stop_token Token that can be used to cancel the operation
+     * @return true if the element was enqueued, false if cancelled
+     */
+    template<typename StopToken>
+    bool enqueue(T&& item, StopToken&& stop_token) {
+        // Try fast path first
+        if (try_enqueue(std::move(item)))
+            return true;
+
+        // Slow path with blocking and cancellation support
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+        blocking_.not_full_.wait(lock, [this, &item] { return try_enqueue(std::move(item)) || !blocking_.is_active_.load(std::memory_order_acquire); });
+
+        // Check if enqueue succeeded or stopped
+        return !stop_token.stop_requested() && blocking_.is_active_.load(std::memory_order_acquire);
+    }
+
+    /**
+     * @brief Attempts to enqueue multiple elements in a single operation
+     *
+     * Non-blocking operation that attempts to add multiple items to the queue.
+     *
+     * @tparam InputIt Iterator type pointing to elements
+     * @param first Iterator to the first element to enqueue
+     * @param count Number of elements to enqueue
+     * @return Number of elements successfully enqueued
+     */
+    template<typename InputIt>
+    size_t try_enqueue_bulk(InputIt first, size_t count) noexcept {
+        if (count == 0)
+            return 0;
+
+        // Fast path with relaxed ordering first
+        const size_t current_tail = tail_.load(std::memory_order_relaxed);
+
+        // Calculate available space (optimized)
+        const size_t head = head_.load(std::memory_order_acquire);
+        const size_t capacity = ActualCapacity;
+        const size_t available_space = (head + capacity - current_tail - 1) & INDEX_MASK;
+
+        if (available_space == 0)
+            return 0;
+
+        // Calculate actual amount to copy
+        const size_t to_copy = std::min(available_space, count);
+        const bool was_empty = (current_tail == head);
+
+        // Optimize based on whether the enqueue wraps around the buffer
+        const size_t first_chunk = std::min(to_copy, capacity - current_tail);
+        const size_t second_chunk = to_copy - first_chunk;
+
+        // Use the fastest copy method based on type
+        if constexpr (std::is_trivially_copyable_v<T>) {
+            if constexpr (std::is_pointer_v<InputIt> && std::is_same_v<std::remove_pointer_t<InputIt>, T>) {
+                // Pointer to same type - use SIMD-optimized memory transfer
+                // Use SIMD for first chunk
+                detail::simd_memcpy(&buffer_[current_tail], first, first_chunk);
+
+                // Handle wrap-around if needed with SIMD
+                if (second_chunk > 0) {
+                    detail::simd_memcpy(&buffer_[0], first + first_chunk, second_chunk);
+                }
+            } else {
+                // Process first chunk
+                auto it = first;
+                for (size_t i = 0; i < first_chunk; i++) {
+                    buffer_[current_tail + i] = *it++;
+                }
+
+                // Process second chunk if needed
+                for (size_t i = 0; i < second_chunk; i++) {
+                    buffer_[i] = *it++;
+                }
+            }
+        } else {
+            // Non-trivially copyable type - use placement new with iterator
+            auto it = first;
+
+            // Process first chunk
+            for (size_t i = 0; i < first_chunk; i++) {
+                new (&buffer_[current_tail + i]) T(*it++);
+            }
+
+            // Process second chunk if needed
+            for (size_t i = 0; i < second_chunk; i++) {
+                new (&buffer_[i]) T(*it++);
+            }
+        }
+
+        // Update tail position with a single atomic operation
+        tail_.store((current_tail + to_copy) & INDEX_MASK, std::memory_order_release);
+
+        // Only notify if queue was empty before
+        if (was_empty) {
+            blocking_.not_empty_.notify_one();
+        }
+
+        return to_copy;
+    }
+
+    /**
+     * @brief Enqueues multiple elements, blocking if necessary
+     *
+     * Blocks until all elements are enqueued or the queue is shut down.
+     *
+     * @tparam InputIt Iterator type pointing to elements
+     * @param first Iterator to the first element to enqueue
+     * @param count Number of elements to enqueue
+     * @return Number of elements successfully enqueued
+     */
+    template<typename InputIt>
+    size_t enqueue_bulk(InputIt first, size_t count) {
+        if (count == 0)
+            return 0;
+
+        // Try non-blocking fast path first
+        size_t items_enqueued = try_enqueue_bulk(first, count);
+        if (items_enqueued == count) {
+            return items_enqueued;
+        }
+
+        // Advance iterator by items already enqueued
+        std::advance(first, items_enqueued);
+        size_t remaining = count - items_enqueued;
+
+        // Exponential backoff spinning before falling back to mutex
+        detail::exponential_backoff backoff;
+        for (int i = 0; i < SPIN_ATTEMPTS; i++) {  // Try spinning a few times first
+            size_t batch_enqueued = try_enqueue_bulk(first, remaining);
+            if (batch_enqueued > 0) {
+                std::advance(first, batch_enqueued);
+                items_enqueued += batch_enqueued;
+                remaining -= batch_enqueued;
+
+                if (items_enqueued == count) {
+                    return items_enqueued;
+                }
+            }
+            backoff();
+        }
+
+        // Fall back to mutex-based waiting
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        while (items_enqueued < count && blocking_.is_active_.load(std::memory_order_acquire)) {
+            // Wait until space is available
+            blocking_.not_full_.wait(lock, [this] { return !is_full() || !blocking_.is_active_.load(std::memory_order_acquire); });
+
+            if (!blocking_.is_active_.load(std::memory_order_acquire)) {
+                break;  // Queue was shut down
+            }
+
+            // Critical section - minimize time with lock held
+            lock.unlock();
+
+            // Try to enqueue multiple items in one go
+            size_t batch_enqueued = try_enqueue_bulk(first, remaining);
+
+            lock.lock();
+
+            if (batch_enqueued > 0) {
+                std::advance(first, batch_enqueued);
+                items_enqueued += batch_enqueued;
+                remaining -= batch_enqueued;
+
+                if (items_enqueued == count) {
+                    break;
+                }
+            }
+        }
+
+        return items_enqueued;
+    }
+
+    /**
+     * @brief Enqueues multiple elements with a timeout
+     *
+     * Attempts to enqueue elements until the specified timeout expires.
+     *
+     * @tparam InputIt Iterator type pointing to elements
+     * @tparam Rep Duration representation type
+     * @tparam Period Duration period type
+     * @param first Iterator to the first element to enqueue
+     * @param count Number of elements to enqueue
+     * @param timeout Maximum time to wait
+     * @return Number of elements successfully enqueued
+     */
+    template<typename InputIt, typename Rep, typename Period>
+    size_t enqueue_bulk_for(InputIt first, size_t count, const std::chrono::duration<Rep, Period>& timeout) {
+        if (count == 0)
+            return 0;
+
+        // Track start time for timeout
+        auto start_time = std::chrono::steady_clock::now();
+        auto end_time = start_time + timeout;
+
+        // Try non-blocking fast path first
+        size_t items_enqueued = try_enqueue_bulk(first, count);
+        if (items_enqueued == count) {
+            return items_enqueued;
+        }
+
+        // Advance iterator by items already enqueued
+        std::advance(first, items_enqueued);
+        size_t remaining = count - items_enqueued;
+
+        // Adaptive spinning phase - use up to 20% of timeout for spinning
+        // Fix: convert both durations to microseconds for comparison
+        auto timeout_us = std::chrono::duration_cast<std::chrono::microseconds>(timeout);
+        auto spin_time = std::min(timeout_us / 5, std::chrono::microseconds(200));
+        auto spin_end_time = start_time + spin_time;
+
+        // Spin with exponential backoff
+        detail::exponential_backoff backoff;
+        while (items_enqueued < count && std::chrono::steady_clock::now() < spin_end_time) {
+            size_t batch_enqueued = try_enqueue_bulk(first, remaining);
+            if (batch_enqueued > 0) {
+                std::advance(first, batch_enqueued);
+                items_enqueued += batch_enqueued;
+                remaining -= batch_enqueued;
+
+                if (items_enqueued == count) {
+                    return items_enqueued;
+                }
+
+                // Reset backoff on progress
+                backoff.reset();
+            }
+            backoff();
+        }
+
+        // Check if timeout expired during spinning
+        if (std::chrono::steady_clock::now() >= end_time) {
+            return items_enqueued;
+        }
+
+        // Fall back to condition variable waiting
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        do {
+            // Wait until space is available or timeout
+            if (!blocking_.not_full_.wait_until(lock, end_time, [this] { return !is_full() || !blocking_.is_active_.load(std::memory_order_acquire); })) {
+                break;  // Timeout occurred
+            }
+
+            if (!blocking_.is_active_.load(std::memory_order_acquire)) {
+                break;  // Queue was shut down
+            }
+
+            // Release lock during actual enqueue operation
+            lock.unlock();
+            size_t batch_enqueued = try_enqueue_bulk(first, remaining);
+            lock.lock();
+
+            if (batch_enqueued > 0) {
+                std::advance(first, batch_enqueued);
+                items_enqueued += batch_enqueued;
+                remaining -= batch_enqueued;
+
+                if (items_enqueued == count) {
+                    break;  // All items enqueued
+                }
+            }
+
+        } while (items_enqueued < count && std::chrono::steady_clock::now() < end_time && blocking_.is_active_.load(std::memory_order_acquire));
+
+        return items_enqueued;
+    }
+
+
+    //////////DEQUEUE OPERATIONS//////////
+
+    /**
+     * @brief Attempts to dequeue an element
+     *
+     * Non-blocking operation that attempts to remove an item from the queue.
+     *
+     * @param item Reference to store the dequeued element
+     * @return true if successful, false if the queue was empty
+     */
+    bool try_dequeue(T& item) noexcept(std::is_nothrow_move_assignable_v<T>) {
+        const size_t current_head = head_.load(std::memory_order_relaxed);
+
+        // Early relaxed check before acquiring
+        if (current_head == tail_.load(std::memory_order_relaxed)) {
+            if (current_head == tail_.load(std::memory_order_acquire))
+                return false;
+        }
+
+        // Prefetch next items in queue for better throughput
+        const size_t next_head = (current_head + 1) & INDEX_MASK;
+        if (next_head != tail_.load(std::memory_order_relaxed)) {
+            detail::prefetch_read(&buffer_[next_head], 3);
+
+            const size_t next_next_head = (next_head + 1) & INDEX_MASK;
+            if (next_next_head != tail_.load(std::memory_order_relaxed)) {
+                detail::prefetch_read(&buffer_[next_next_head], 2);  // Lower locality hint
+            }
+        }
+
+        // Move the item out with optimization for trivial types
+        if constexpr (std::is_trivially_copyable_v<T> && sizeof(T) <= 16) {
+            item = buffer_[current_head];
+        } else {
+            item = std::move(buffer_[current_head]);
+
+            // Only call destructor if not an unsafe type after move
+            if constexpr (!detail::unsafe_to_destroy_after_move_v<T>) {
+                buffer_[current_head].~T();
+            }
+        }
+
+        // Release memory ordering ensures visibility to producer
+        head_.store(next_head, std::memory_order_release);
+
+        // Selective notification strategy
+        const size_t used_capacity = ((tail_.load(std::memory_order_relaxed) - next_head) & INDEX_MASK);
+        if (used_capacity < ActualCapacity / 4) {
+            blocking_.not_full_.notify_one();
+        }
+
+        return true;
+    }
+
+    /**
+     * @brief Dequeues an element, blocking if necessary
+     *
+     * Blocks the calling thread until an item is available in the queue.
+     *
+     * @param item Reference to store the dequeued element
+     * @return true if an element was dequeued, false if the queue was shut down
+     */
+    bool dequeue(T& item) {
+        // Try optimistic fast path first
+        if (try_dequeue(item))
+            return true;
+
+        // Use exponential backoff with CPU hints
+        detail::exponential_backoff backoff;
+        for (int i = 0; i < SPIN_ATTEMPTS; ++i) {
+            if (try_dequeue(item))
+                return true;
+            backoff();
+        }
+
+        // Fall back to blocking wait
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+        blocking_.not_empty_.wait(lock, [this, &item] { return try_dequeue(item) || !blocking_.is_active_.load(std::memory_order_acquire); });
+        return blocking_.is_active_.load(std::memory_order_acquire);
+    }
+
+    /**
+     * @brief Add a timed wait method with adaptive waiting
+     *
+     * @tparam Rep Duration representation type
+     * @tparam Period Duration period type
+     * @param timeout Maximum time to wait
+     * @return true if an element was dequeued, false if timeout expired during spinning
+     */
+    template<typename Rep, typename Period>
+    bool dequeue_for(T& item, const std::chrono::duration<Rep, Period>& timeout) {
+        // Try fast path first
+        if (try_dequeue(item))
+            return true;
+
+        // Calculate how much time to allocate for spinning vs blocking
+        auto start_time = std::chrono::steady_clock::now();
+        auto spin_duration = std::min(timeout / 2, std::chrono::milliseconds(1));
+        auto spin_end_time = start_time + spin_duration;
+
+        // Spin with increasing backoff until spin time elapsed
+        detail::exponential_backoff backoff;
+        while (std::chrono::steady_clock::now() < spin_end_time) {
+            if (try_dequeue(item))
+                return true;
+
+            backoff();
+        }
+
+        // Calculate remaining time for blocking wait
+        auto current_time = std::chrono::steady_clock::now();
+        auto remaining = timeout - (current_time - start_time);
+        if (remaining <= std::chrono::duration<Rep, Period>::zero())
+            return false;  // Timeout already expired during spinning
+
+        // Slow path with timeout
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+        return blocking_.not_empty_.wait_for(lock, remaining, [this, &item] { return try_dequeue(item) || !blocking_.is_active_.load(std::memory_order_acquire); }) && blocking_.is_active_.load(std::memory_order_acquire);
+    }
+
+    /**
+     * @brief Dequeues an element with cancellation support
+     *
+     * Blocks until an item is available or the operation is cancelled.
+     *
+     * @tparam StopToken Type meeting the StopToken concept
+     * @param item Reference to store the dequeued element
+     * @param stop_token Token that can be used to cancel the operation
+     * @return true if an element was dequeued, false if cancelled or queue shut down
+     */
+    template<typename StopToken>
+    bool dequeue(T& item, StopToken&& stop_token) {
+        // Try fast path first
+        if (try_dequeue(item))
+            return true;
+
+        // Slow path with blocking and cancellation support
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        // Wait until item available, queue inactive, or stop requested
+        std::condition_variable_any{}.wait(lock, stop_token, [this, &item] { return try_dequeue(item) || !blocking_.is_active_.load(std::memory_order_acquire); });
+
+        // Check if we got an item or stopped
+        return !stop_token.stop_requested() && blocking_.is_active_.load(std::memory_order_acquire);
+    }
+
+    /**
+     * @brief Dequeues an element with timeout and cancellation support
+     *
+     * Attempts to dequeue an element, waiting up to the specified timeout
+     * or until the operation is cancelled.
+     *
+     * @tparam Rep Duration representation type
+     * @tparam Period Duration period type
+     * @tparam StopToken Type meeting the StopToken concept
+     * @param item Reference to store the dequeued element
+     * @param timeout Maximum time to wait
+     * @param stop_token Token that can be used to cancel the operation
+     * @return true if an element was dequeued, false otherwise
+     */
+    template<typename Rep, typename Period, typename StopToken>
+    bool dequeue_for(T& item, const std::chrono::duration<Rep, Period>& timeout, StopToken&& stop_token) {
+        // Try fast path first
+        if (try_dequeue(item))
+            return true;
+
+        // Slow path with timeout and cancellation support
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        // Wait until item available, timeout, queue inactive, or stop requested
+        std::condition_variable_any{}.wait_for(lock, timeout, stop_token, [this, &item] { return try_dequeue(item) || !blocking_.is_active_.load(std::memory_order_acquire); });
+
+        // Return success only if we got an item (not stopped or timed out)
+        return !stop_token.stop_requested() && blocking_.is_active_.load(std::memory_order_acquire) && !is_empty();
+    }
+
+    /**
+     * @brief Attempts to dequeue multiple elements in a single operation
+     *
+     * Non-blocking operation that attempts to remove multiple items from the queue.
+     *
+     * @tparam OutputIt Iterator type for destination
+     * @param dest Iterator to the destination to store dequeued elements
+     * @param max_items Maximum number of elements to dequeue
+     * @return Number of elements successfully dequeued
+     */
+    template<typename OutputIt>
+    size_t try_dequeue_bulk(OutputIt dest, size_t max_items) noexcept {
+        // Quick empty check with relaxed ordering (fastest path)
+        const size_t current_head = head_.load(std::memory_order_relaxed);
+
+        // Use relaxed first, then acquire only if needed
+        size_t tail = tail_.load(std::memory_order_relaxed);
+        if (current_head == tail) {
+            tail = tail_.load(std::memory_order_acquire);
+            if (current_head == tail) {
+                return 0;
+            }
+        }
+
+        // Calculate items to dequeue with minimal calculations
+        const size_t available = (tail - current_head) & INDEX_MASK;
+        const size_t to_copy = std::min(available, max_items);
+
+        // Optimize based on whether the dequeue wraps around the buffer
+        const size_t first_chunk = std::min(to_copy, ActualCapacity - current_head);
+        const size_t second_chunk = to_copy - first_chunk;
+
+        // Prefetch the next cache lines ahead of time to reduce false sharing impact
+        if (first_chunk > 1) {
+            // Prefetch several cache lines ahead to minimize false sharing effects
+            for (size_t i = 0; i < std::min(first_chunk, size_t(4)); i++) {
+                detail::prefetch_read(&buffer_[current_head + i], 3);
+            }
+        }
+
+        // Use the fastest copy method based on type and iterator
+        if constexpr (std::is_trivially_copyable_v<T>) {
+            if constexpr (std::is_pointer_v<OutputIt> && std::is_same_v<std::remove_pointer_t<OutputIt>, T>) {
+                // Pointer to same type - use SIMD-optimized memory transfer
+                // Use SIMD for first chunk
+                detail::simd_memcpy(dest, &buffer_[current_head], first_chunk);
+
+                // Handle wrap-around if needed with SIMD
+                if (second_chunk > 0) {
+                    detail::simd_memcpy(dest + first_chunk, &buffer_[0], second_chunk);
+                }
+            } else {
+                // Other iterator type - use iterator operations
+                std::copy_n(&buffer_[current_head], first_chunk, dest);
+
+                if (second_chunk > 0) {
+                    auto advanced_dest = dest;
+                    std::advance(advanced_dest, first_chunk);
+                    std::copy_n(&buffer_[0], second_chunk, advanced_dest);
+                }
+            }
+        } else {
+            // Non-trivial type - use move semantics
+            for (size_t i = 0; i < first_chunk; i++) {
+                *dest = std::move(buffer_[current_head + i]);
+                ++dest;
+
+                if constexpr (!detail::unsafe_to_destroy_after_move_v<T>) {
+                    buffer_[current_head + i].~T();
+                }
+            }
+
+            for (size_t i = 0; i < second_chunk; i++) {
+                *dest = std::move(buffer_[i]);
+                ++dest;
+
+                if constexpr (!detail::unsafe_to_destroy_after_move_v<T>) {
+                    buffer_[i].~T();
+                }
+            }
+        }
+
+        // Update head position with a single atomic operation
+        head_.store((current_head + to_copy) & INDEX_MASK, std::memory_order_release);
+
+        // Only notify if we freed substantial space
+        if (available == to_copy || to_copy > ActualCapacity / 4) {
+            blocking_.not_full_.notify_one();
+        }
+
+        return to_copy;
+    }
+
+    /**
+     * @brief Dequeues multiple elements
+     *
+     * Attempts to dequeue multiple elements.
+     *
+     * @tparam OutputIt Iterator type for destination
+     * @param dest Iterator to the destination to store dequeued elements
+     * @param max_items Maximum number of elements to dequeue
+     * @param stoken Stop token for cancellation
+     * @return Number of elements successfully dequeued
+     */
+    template<typename OutputIt>
+    size_t dequeue_bulk(OutputIt dest, size_t max_items, std::stop_token stoken = {}) {
+        if (max_items == 0)
+            return 0;
+
+        // Try non-blocking fast path first
+        size_t items_dequeued = try_dequeue_bulk(dest, max_items);
+        if (items_dequeued == max_items) {
+            return items_dequeued;
+        }
+
+        // If we got some items but not all, advance the destination iterator
+        if (items_dequeued > 0) {
+            std::advance(dest, items_dequeued);
+            max_items -= items_dequeued;
+        }
+
+        // Spin with exponential backoff for a short time
+        auto start_time = std::chrono::steady_clock::now();
+        auto spin_time = std::chrono::microseconds(200);
+        auto spin_end_time = start_time + spin_time;
+
+        detail::exponential_backoff backoff;
+        while (items_dequeued < max_items && std::chrono::steady_clock::now() < spin_end_time) {
+            size_t batch_dequeued = try_dequeue_bulk(dest, max_items);
+            if (batch_dequeued > 0) {
+                std::advance(dest, batch_dequeued);
+                items_dequeued += batch_dequeued;
+                max_items -= batch_dequeued;
+
+                if (max_items == 0) {
+                    return items_dequeued;
+                }
+            }
+            backoff();
+        }
+
+        if (stoken.stop_requested()) {
+            return items_dequeued;
+        }
+
+        // Fall back to condition variable waiting
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        do {
+            // Wait until items are available or timeout
+            while (!blocking_.not_empty_.wait_for(lock, 2000ms, [this] { return !is_empty() || !blocking_.is_active_.load(std::memory_order_acquire); })) {
+                if (stoken.stop_requested()) {
+                    return false;
+                }
+            }
+
+            if (!blocking_.is_active_.load(std::memory_order_acquire)) {
+                break;  // Queue was shut down
+            }
+
+            // Release lock during actual dequeue operation
+            lock.unlock();
+            size_t batch_dequeued = try_dequeue_bulk(dest, max_items);
+            lock.lock();
+
+            if (batch_dequeued > 0) {
+                std::advance(dest, batch_dequeued);
+                items_dequeued += batch_dequeued;
+                max_items -= batch_dequeued;
+
+                if (max_items == 0) {
+                    break;  // All items dequeued
+                }
+            }
+
+        } while (max_items > 0 && !stoken.stop_requested() && blocking_.is_active_.load(std::memory_order_acquire));
+
+        return items_dequeued;
+    }
+
+    /**
+     * @brief Dequeues multiple elements with timeout
+     *
+     * Attempts to dequeue multiple elements, waiting up to the specified timeout.
+     *
+     * @tparam OutputIt Iterator type for destination
+     * @tparam Rep Duration representation type
+     * @tparam Period Duration period type
+     * @param dest Iterator to the destination to store dequeued elements
+     * @param max_items Maximum number of elements to dequeue
+     * @param timeout Maximum time to wait
+     * @return Number of elements successfully dequeued
+     */
+    template<typename OutputIt, typename Rep, typename Period>
+    size_t dequeue_bulk_for(OutputIt dest, size_t max_items, const std::chrono::duration<Rep, Period>& timeout) {
+        if (max_items == 0)
+            return 0;
+
+        // Track start time for timeout
+        auto start_time = std::chrono::steady_clock::now();
+        auto end_time = start_time + timeout;
+
+        // Try non-blocking fast path first
+        size_t items_dequeued = try_dequeue_bulk(dest, max_items);
+        if (items_dequeued == max_items) {
+            return items_dequeued;
+        }
+
+        // If we got some items but not all, advance the destination iterator
+        if (items_dequeued > 0) {
+            std::advance(dest, items_dequeued);
+            max_items -= items_dequeued;
+        }
+
+        // Spin with exponential backoff for a short time
+        auto timeout_us = std::chrono::duration_cast<std::chrono::microseconds>(timeout);
+        auto spin_time = std::min(timeout_us / 5, std::chrono::microseconds(200));
+        auto spin_end_time = start_time + spin_time;
+
+        detail::exponential_backoff backoff;
+        while (items_dequeued < max_items && std::chrono::steady_clock::now() < spin_end_time) {
+            size_t batch_dequeued = try_dequeue_bulk(dest, max_items);
+            if (batch_dequeued > 0) {
+                std::advance(dest, batch_dequeued);
+                items_dequeued += batch_dequeued;
+                max_items -= batch_dequeued;
+
+                if (max_items == 0) {
+                    return items_dequeued;
+                }
+            }
+            backoff();
+        }
+
+        // Check if timeout expired during spinning
+        if (std::chrono::steady_clock::now() >= end_time) {
+            return items_dequeued;
+        }
+
+        // Fall back to condition variable waiting
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        do {
+            // Wait until items are available or timeout
+            if (!blocking_.not_empty_.wait_until(lock, end_time, [this] { return !is_empty() || !blocking_.is_active_.load(std::memory_order_acquire); })) {
+                break;  // Timeout occurred
+            }
+
+            if (!blocking_.is_active_.load(std::memory_order_acquire)) {
+                break;  // Queue was shut down
+            }
+
+            // Release lock during actual dequeue operation
+            lock.unlock();
+            size_t batch_dequeued = try_dequeue_bulk(dest, max_items);
+            lock.lock();
+
+            if (batch_dequeued > 0) {
+                std::advance(dest, batch_dequeued);
+                items_dequeued += batch_dequeued;
+                max_items -= batch_dequeued;
+
+                if (max_items == 0) {
+                    break;  // All items dequeued
+                }
+            }
+
+        } while (max_items > 0 && std::chrono::steady_clock::now() < end_time && blocking_.is_active_.load(std::memory_order_acquire));
+
+        return items_dequeued;
+    }
+
+    /**
+     * @brief Dequeues any available elements with timeout
+     *
+     * Attempts to dequeue elements, returning as soon as any are available
+     * or the timeout expires.
+     *
+     * @tparam OutputIt Iterator type for destination
+     * @tparam Rep Duration representation type
+     * @tparam Period Duration period type
+     * @param dest Iterator to the destination to store dequeued elements
+     * @param max_items Maximum number of elements to dequeue
+     * @param timeout Maximum time to wait
+     * @return Number of elements successfully dequeued
+     */
+    template<typename OutputIt, typename Rep, typename Period>
+    size_t dequeue_bulk_for_any(OutputIt dest, size_t max_items, const std::chrono::duration<Rep, Period>& timeout) {
+        if (max_items == 0)
+            return 0;
+
+        // Try non-blocking fast path first
+        size_t items_dequeued = try_dequeue_bulk(dest, max_items);
+        if (items_dequeued > 0) {
+            return items_dequeued;  // Return immediately if any items were dequeued
+        }
+
+        // Track time for timeout
+        auto start_time = std::chrono::steady_clock::now();
+        auto end_time = start_time + timeout;
+
+        // Spin with exponential backoff for a short time
+        auto timeout_us = std::chrono::duration_cast<std::chrono::microseconds>(timeout);
+        auto spin_time = std::min(timeout_us / 5, std::chrono::microseconds(100));
+        auto spin_end_time = start_time + spin_time;
+
+        detail::exponential_backoff backoff;
+        while (std::chrono::steady_clock::now() < spin_end_time) {
+            size_t batch_dequeued = try_dequeue_bulk(dest, max_items);
+            if (batch_dequeued > 0) {
+                return batch_dequeued;  // Return immediately with any items
+            }
+            backoff();
+        }
+
+        // Fall back to condition variable waiting
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        // Wait until any items are available, timeout, or queue inactive
+        bool has_items = blocking_.not_empty_.wait_until(lock, end_time, [this] { return !is_empty() || !blocking_.is_active_.load(std::memory_order_acquire); });
+
+        // If no items or queue shut down, return 0
+        if (!has_items || !blocking_.is_active_.load(std::memory_order_acquire)) {
+            return 0;
+        }
+
+        // Try to dequeue with lock released
+        lock.unlock();
+        return try_dequeue_bulk(dest, max_items);
+    }
+
+    //////////EMPLACE OPERATIONS//////////
+
+    /**
+     * @brief Attempts to construct an element in-place in the queue
+     *
+     * Non-blocking operation that attempts to construct an element
+     * directly in the queue's buffer.
+     *
+     * @tparam Args Types of arguments to forward to the constructor
+     * @param args Arguments to forward to the constructor
+     * @return true if successful, false if the queue was full
+     */
+    template<typename... Args>
+    bool try_emplace(Args&&... args) noexcept(std::is_nothrow_constructible_v<T, Args...>) {
+        const size_t current_tail = tail_.load(std::memory_order_relaxed);
+        const size_t next_tail = (current_tail + 1) & INDEX_MASK;
+
+        // Optimization: Relaxed load first, then acquire if needed
+        if (next_tail == head_.load(std::memory_order_relaxed)) {
+            // Double-check with acquire semantics
+            if (next_tail == head_.load(std::memory_order_acquire))
+                return false;  // Queue is full
+        }
+
+        // Optimization: Prefetch for write to reduce cache misses
+        detail::prefetch_write(&buffer_[current_tail]);
+
+        new (&buffer_[current_tail]) T(std::forward<Args>(args)...);
+        tail_.store(next_tail, std::memory_order_release);
+
+        // Notify if queue was empty
+        if (current_tail == head_.load(std::memory_order_relaxed))
+            blocking_.not_empty_.notify_one();
+
+        return true;
+    }
+
+    /**
+     * @brief Constructs an element in-place in the queue, blocking if necessary
+     *
+     * Blocks the calling thread until space is available in the queue.
+     *
+     * @tparam Args Types of arguments to forward to the constructor
+     * @param args Arguments to forward to the constructor
+     */
+    template<typename... Args>
+    void emplace(Args&&... args) {
+        // Try fast path first
+        if (try_emplace(std::forward<Args>(args)...))
+            return;
+
+        // Slow path with blocking
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+        blocking_.not_full_.wait(lock, [this, &args...] { return try_emplace(std::forward<Args>(args)...) || !blocking_.is_active_.load(std::memory_order_acquire); });
+    }
+
+    /**
+     * @brief Constructs an element in-place with cancellation support
+     *
+     * Blocks until space is available or the operation is cancelled.
+     *
+     * @tparam StopToken Type meeting the StopToken concept
+     * @tparam Args Types of arguments to forward to the constructor
+     * @param stop_token Token that can be used to cancel the operation
+     * @param args Arguments to forward to the constructor
+     * @return true if the element was emplaced, false if cancelled
+     */
+    template<typename StopToken, typename... Args>
+    bool emplace(StopToken&& stop_token, Args&&... args) {
+        // Try fast path first
+        if (try_emplace(std::forward<Args>(args)...))
+            return true;
+
+        // Slow path with blocking and cancellation support
+        std::unique_lock<std::mutex> lock(blocking_.mutex_);
+
+        // Wait until space available, queue inactive, or stop requested
+        std::condition_variable_any{}.wait(lock, stop_token, [this, &args...] { return try_emplace(std::forward<Args>(args)...) || !blocking_.is_active_.load(std::memory_order_acquire); });
+
+        // Check if emplace succeeded or stopped
+        return !stop_token.stop_requested() && blocking_.is_active_.load(std::memory_order_acquire);
+    }
+
+    /////////////UTILITY METHODS//////////
+
+    /**
+     * @brief Checks if the queue is empty
+     *
+     * @return true if the queue is empty, false otherwise
+     */
+    bool is_empty() const noexcept { return head_.load(std::memory_order_relaxed) == tail_.load(std::memory_order_relaxed); }
+
+    /**
+     * @brief Checks if the queue is full
+     *
+     * @return true if the queue is full, false otherwise
+     */
+    bool is_full() const noexcept {
+        const size_t next_tail = (tail_.load(std::memory_order_relaxed) + 1) & INDEX_MASK;
+        return next_tail == head_.load(std::memory_order_relaxed);
+    }
+
+    /**
+     * @brief Checks if the queue is nearly empty
+     *
+     * Useful for making decisions about throttling or batching.
+     *
+     * @return true if the queue is less than 1/8 full, false otherwise
+     */
+    bool is_almost_empty() const noexcept {
+        const size_t head = head_.load(std::memory_order_relaxed);
+        const size_t tail = tail_.load(std::memory_order_relaxed);
+        const size_t size = (tail - head) & INDEX_MASK;
+        return size < ActualCapacity / 8;
+    }
+
+    /**
+     * @brief Checks if the queue is nearly full
+     *
+     * Useful for making decisions about throttling or batching.
+     *
+     * @return true if the queue is more than 7/8 full, false otherwise
+     */
+    bool is_almost_full() const noexcept {
+        const size_t head = head_.load(std::memory_order_relaxed);
+        const size_t tail = tail_.load(std::memory_order_relaxed);
+        const size_t free = ((head - tail - 1) & INDEX_MASK);
+        return free < ActualCapacity / 8;
+    }
+
+    /**
+     * @brief Gets the current number of elements in the queue
+     *
+     * @return Current size of the queue
+     */
+    size_t size() const noexcept { return (tail_.load(std::memory_order_relaxed) - head_.load(std::memory_order_relaxed)) & INDEX_MASK; }
+
+    /**
+     * @brief Gets the capacity of the queue
+     *
+     * Returns the actual usable capacity, which is one less than
+     * the internal buffer size due to the need to distinguish
+     * between empty and full states.
+     *
+     * @return Maximum number of elements the queue can hold
+     */
+    constexpr size_t capacity() const noexcept {
+        return ActualCapacity - 1;  // One slot is always kept empty
+    }
+
+    /**
+     * @brief Gets the requested capacity from construction
+     *
+     * @return The minimum capacity requested when the queue was created
+     */
+    constexpr size_t requested_capacity() const noexcept { return RequestedCapacity; }
+
+    /**
+     * @brief Gets the actual capacity after power-of-2 rounding
+     *
+     * @return The actual capacity of the queue
+     */
+    constexpr size_t actual_capacity() const noexcept { return ActualCapacity - 1; }
+
+    /**
+     * @brief Shuts down the queue
+     *
+     * Wakes up all waiting threads and marks the queue as inactive.
+     * No new blocking operations will succeed after shutdown.
+     */
+    void shutdown() noexcept {
+        blocking_.is_active_.store(false, std::memory_order_release);
+        blocking_.not_empty_.notify_all();
+        blocking_.not_full_.notify_all();
+    }
+
+    /**
+     * @brief Processes and removes all elements from the queue
+     *
+     * Efficiently drains the queue, passing each element to the provided consumer.
+     *
+     * @tparam Consumer Callable type that accepts T&&
+     * @param consumer Function or functor to process each dequeued element
+     * @return Number of elements processed
+     */
+    template<typename Consumer>
+    size_t drain_all(Consumer&& consumer) {
+        size_t count = 0;
+        T item;
+
+        // Fast path with bulk extraction when possible
+        if constexpr (std::is_trivially_copyable_v<T> && sizeof(T) <= 64) {
+            // For small trivial types, extract in batches for processing
+            constexpr size_t BATCH_SIZE = 16;
+            std::array<T, BATCH_SIZE> items;
+
+            while (true) {
+                size_t batch_count = try_dequeue_bulk(items.data(), BATCH_SIZE);
+                if (batch_count == 0)
+                    break;
+
+                for (size_t i = 0; i < batch_count; i++) {
+                    consumer(std::move(items[i]));
+                }
+
+                count += batch_count;
+            }
+        } else {
+            // For larger/non-trivial types, process one-by-one
+            while (try_dequeue(item)) {
+                consumer(std::move(item));
+                count++;
+            }
+        }
+
+        return count;
+    }
+};
+
+#endif  // SPSC_QUEUE_HPP
\ No newline at end of file
diff --git a/src/FINNCppDriver/utils/mdspan.h b/src/FINNCppDriver/utils/mdspan.h
deleted file mode 100644
index 0cfed21..0000000
--- a/src/FINNCppDriver/utils/mdspan.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * @file mdspan.h
- * @author Linus Jungemann (linus.jungemann@uni-paderborn.de) and others
- * @brief Provides the C++23 functionality of mdspan
- * @version 0.1
- * @date 2023-10-31
- *
- * @copyright Copyright (c) 2023
- * @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
- *
- */
-
-#ifndef MDSPAN_H
-#define MDSPAN_H
-
-#include <span>
-
-#ifdef __cpp_lib_mdspan
-    #warning("Warning: Both std and stdex mdspan are enabled. stdex should be disabled as it is deprecated! ")
-#endif
-
-
-#include <mdspan/mdspan.hpp>
-namespace stdex = Kokkos;
-
-
-/**
- * @brief Implementation of makeMDSpan
- *
- * @tparam T Type of data stored in array
- * @tparam Array Type of array of dimensions
- * @tparam I Internal
- * @param data Pointer to underlying data array
- * @param a array of dimensions
- * @param u index sequence (unused)
- * @return auto mdspan
- */
-template<typename T, typename Array, std::size_t... I>
-auto makeMDSpanImpl(T* data, const Array& a, [[maybe_unused]] std::index_sequence<I...> u) {
-    return stdex::mdspan(data, a[I]...);
-}
-
-/**
- * @brief Constructs a mdspan
- *
- * @tparam T Type of data stored in array
- * @tparam N Number of dimensions (autodeduced)
- * @tparam Indices Index sequence for dimension array (autodeduced)
- * @param data Pointer to underlying data array
- * @param list
- * @return auto mdspan
- */
-template<typename T, std::size_t N, typename Indices = std::make_index_sequence<N>>
-// NOLINTNEXTLINE
-auto makeMDSpan(T* data, const T (&list)[N]) {
-    return makeMDSpanImpl(data, list, Indices{});
-}
-
-#endif  // MDSPAN_H
diff --git a/unittests/core/BaseDriverTest.cpp b/unittests/core/BaseDriverTest.cpp
index 2613abc..3a26758 100644
--- a/unittests/core/BaseDriverTest.cpp
+++ b/unittests/core/BaseDriverTest.cpp
@@ -13,12 +13,12 @@
 
 #include <FINNCppDriver/config/FinnDriverUsedDatatypes.h>
 #include <FINNCppDriver/utils/FinnUtils.h>
-#include <FINNCppDriver/utils/Logger.h>
 #include <FINNCppDriver/utils/Types.h>
 
 #include <FINNCppDriver/core/BaseDriver.hpp>
 #include <FINNCppDriver/core/DeviceBuffer/SyncDeviceBuffers.hpp>
 #include <FINNCppDriver/utils/FinnDatatypes.hpp>
+#include <FINNCppDriver/utils/Logger.hpp>
 #include <FINNCppDriver/utils/join.hpp>
 
 #include "gtest/gtest.h"
@@ -45,15 +45,13 @@ class BaseDriverTest : public ::testing::Test {
 class TestDriver : public Finn::Driver<true> {
      public:
     TestDriver(const Finn::Config& pConfig, unsigned int hostBufferSize) : Finn::Driver<true>(pConfig, hostBufferSize) {}
-    Finn::vector<uint8_t> inferR(const Finn::vector<uint8_t>& data, unsigned int inputDeviceIndex, const std::string& inputBufferKernelName, unsigned int outputDeviceIndex, const std::string& outputBufferKernelName, unsigned int samples,
-                                 bool forceArchival) {
-        return infer(data, inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, samples, forceArchival);
+    Finn::vector<uint8_t> inferR(const Finn::vector<uint8_t>& data, unsigned int inputDeviceIndex, const std::string& inputBufferKernelName, unsigned int outputDeviceIndex, const std::string& outputBufferKernelName, unsigned int samples) {
+        return infer(data, inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, samples);
     }
 
     template<typename IterType>
-    Finn::vector<uint8_t> inferR(IterType first, IterType last, unsigned int inputDeviceIndex, const std::string& inputBufferKernelName, unsigned int outputDeviceIndex, const std::string& outputBufferKernelName, unsigned int samples,
-                                 bool forceArchival) {
-        return infer(first, last, inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, samples, forceArchival);
+    Finn::vector<uint8_t> inferR(IterType first, IterType last, unsigned int inputDeviceIndex, const std::string& inputBufferKernelName, unsigned int outputDeviceIndex, const std::string& outputBufferKernelName, unsigned int samples) {
+        return infer(first, last, inputDeviceIndex, inputBufferKernelName, outputDeviceIndex, outputBufferKernelName, samples);
     }
 };
 
@@ -63,7 +61,7 @@ TEST_F(BaseDriverTest, BasicBaseDriverTest) {
 
     Finn::vector<uint8_t> data;
     Finn::vector<uint8_t> backupData;
-    data.resize(driver.size(SIZE_SPECIFIER::TOTAL_DATA_SIZE, 0, inputDmaName));
+    data.resize(driver.getTotalDataSize(0, inputDmaName));
 
     filler.fillRandom(data.begin(), data.end());
     backupData = data;
@@ -72,9 +70,9 @@ TEST_F(BaseDriverTest, BasicBaseDriverTest) {
     driver.getDeviceHandler(0).getOutputBuffer(outputDmaName)->testSetMap(data);
 
     // Run inference
-    auto results = driver.inferR(data, 0, inputDmaName, 0, outputDmaName, hostBufferSize, 1);
+    auto results = driver.inferR(data, 0, inputDmaName, 0, outputDmaName, hostBufferSize);
 
-    Finn::vector<uint8_t> base(data.begin(), data.begin() + static_cast<long int>(driver.size(SIZE_SPECIFIER::TOTAL_DATA_SIZE, 0, outputDmaName)));
+    Finn::vector<uint8_t> base(data.begin(), data.begin() + static_cast<long int>(driver.getTotalDataSize(0, outputDmaName)));
 
 
     // Checks: That input and output data is the same is just for convenience, in application this does not need to be
@@ -87,12 +85,12 @@ TEST_F(BaseDriverTest, BasicBaseDriverTest) {
 }
 
 TEST_F(BaseDriverTest, syncInferenceTest) {
-    auto driver = Finn::Driver<true>(unittestConfig, 0, inputDmaName, 0, outputDmaName, 1, true);
+    auto driver = Finn::Driver<true>(unittestConfig, 0, inputDmaName, 0, outputDmaName, 1);
 
     // The input has to be 4 times longer than the expected size of the FPGA, because uint8->int2 packing reduces size by factor 4
-    std::cout << driver.size(SIZE_SPECIFIER::FEATUREMAP_SIZE, 0, inputDmaName) << "\n";
+    std::cout << driver.getFeatureMapSize(0, inputDmaName) << "\n";
     Finn::vector<int8_t> data(300, 1);
-    Finn::vector<uint8_t> outdata(driver.size(SIZE_SPECIFIER::FEATUREMAP_SIZE, 0, outputDmaName), 1);
+    Finn::vector<uint8_t> outdata(driver.getFeatureMapSize(0, outputDmaName), 1);
 
     // Setup fake output data
     driver.getDeviceHandler(0).getOutputBuffer(outputDmaName)->testSetMap(outdata);
@@ -100,7 +98,7 @@ TEST_F(BaseDriverTest, syncInferenceTest) {
     // Run inference
     auto results = driver.inferSynchronous(data.begin(), data.end());
 
-    Finn::vector<uint8_t> expected(driver.size(SIZE_SPECIFIER::TOTAL_DATA_SIZE, 0, outputDmaName), 1);
+    Finn::vector<uint8_t> expected(driver.getTotalDataSize(0, outputDmaName), 1);
 
     EXPECT_EQ(results, expected);
 }
diff --git a/unittests/core/CMakeLists.txt b/unittests/core/CMakeLists.txt
index 9662b4e..5d32fa7 100644
--- a/unittests/core/CMakeLists.txt
+++ b/unittests/core/CMakeLists.txt
@@ -1,4 +1,3 @@
 add_unittest(DeviceHandlerTest.cpp)
-add_unittest(RingBufferTest.cpp)
 add_unittest(DeviceBufferTest.cpp)
 add_unittest(BaseDriverTest.cpp)
\ No newline at end of file
diff --git a/unittests/core/DeviceBufferTest.cpp b/unittests/core/DeviceBufferTest.cpp
index 31f7c71..78d0510 100644
--- a/unittests/core/DeviceBufferTest.cpp
+++ b/unittests/core/DeviceBufferTest.cpp
@@ -10,11 +10,10 @@
  *
  */
 
-#include <FINNCppDriver/utils/Logger.h>
-
 #include <FINNCppDriver/core/DeviceBuffer/AsyncDeviceBuffers.hpp>
 #include <FINNCppDriver/core/DeviceBuffer/SyncDeviceBuffers.hpp>
 #include <FINNCppDriver/utils/FinnDatatypes.hpp>
+#include <FINNCppDriver/utils/Logger.hpp>
 #include <memory>
 #include <random>
 #include <span>
@@ -38,7 +37,7 @@ class DBTest : public ::testing::Test {
 
 TEST_F(DBTest, DBStoreTest) {
     Finn::SyncDeviceInputBuffer<uint8_t> buffer("InputBuffer", device, uuid, FinnUnittest::myShapePacked, FinnUnittest::parts);
-    Finn::vector<uint8_t> data(buffer.size(SIZE_SPECIFIER::FEATUREMAP_SIZE) * buffer.size(SIZE_SPECIFIER::BATCHSIZE));
+    Finn::vector<uint8_t> data(buffer.getFeatureMapSize() * buffer.getBatchSize());
     FinnUtils::BufferFiller(0, 255).fillRandom(data.begin(), data.end());
     buffer.store({data.begin(), data.end()});
     EXPECT_EQ(buffer.testGetMap(), data);
@@ -46,7 +45,7 @@ TEST_F(DBTest, DBStoreTest) {
 
 TEST_F(DBTest, DBOutputTest) {
     Finn::SyncDeviceOutputBuffer<uint8_t> buffer("OutputBuffer", device, uuid, FinnUnittest::myShapePacked, FinnUnittest::parts);
-    Finn::vector<uint8_t> data(buffer.size(SIZE_SPECIFIER::TOTAL_DATA_SIZE));
+    Finn::vector<uint8_t> data(buffer.getTotalDataSize());
     FinnUtils::BufferFiller(0, 255).fillRandom(data.begin(), data.end());
     buffer.testSetMap(data);
     buffer.read();
diff --git a/unittests/core/RingBufferTest.cpp b/unittests/core/RingBufferTest.cpp
deleted file mode 100644
index e66c7e7..0000000
--- a/unittests/core/RingBufferTest.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/**
- * @file RingBufferTest.cpp
- * @author Bjarne Wintermann (bjarne.wintermann@uni-paderborn.de) and others
- * @brief Unittest for the Ring Buffer
- * @version 0.1
- * @date 2023-10-31
- *
- * @copyright Copyright (c) 2023
- * @license All rights reserved. This program and the accompanying materials are made available under the terms of the MIT license.
- *
- */
-
-#include <FINNCppDriver/utils/FinnUtils.h>
-#include <FINNCppDriver/utils/Logger.h>
-
-#include <FINNCppDriver/utils/RingBuffer.hpp>
-#include <algorithm>
-#include <functional>
-#include <random>
-#include <thread>
-#include <vector>
-
-#include "UnittestConfig.h"
-#include "gtest/gtest.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-// Globals
-using RB = Finn::RingBuffer<int, false>;
-const size_t parts = FinnUnittest::parts;
-// const size_t elementsPerPart = FinnUnittest::elementsPerPart;
-const size_t elementsPerPart = 5;
-
-
-class RBTest : public ::testing::Test {
-     protected:
-    RB rb = RB(parts, elementsPerPart);
-    Finn::vector<int> data;
-    std::vector<Finn::vector<int>> storedDatas;
-    FinnUtils::BufferFiller filler = FinnUtils::BufferFiller(0, 255);
-    void SetUp() override { data.resize(rb.size(SIZE_SPECIFIER::FEATUREMAP_SIZE)); }
-
-    /**
-     * @brief Utility function to completely fill a ringBuffer or a deviceinput/output buffer.
-     *
-     * This function uses the data vector to fill the entire rb of type T with random data, based on it's size.
-     * storedDatas gets all data used pushed back.
-     *
-     * @param fast Whether to use fast store methods (no mutex locking, no length checks)
-     * @param ref Whether to use references (true) or iterators (false)
-     */
-    void fillCompletely(bool ref) {
-        for (size_t i = 0; i < rb.size(SIZE_SPECIFIER::BATCHSIZE); i++) {
-            filler.fillRandom(data.begin(), data.end());
-            storedDatas.push_back(data);
-
-            if (ref) {
-                EXPECT_TRUE(rb.store(data.begin(), data.size()));
-            } else {
-                EXPECT_TRUE(rb.store(data.begin(), data.end()));
-            }
-        }
-    }
-
-    void TearDown() override {}
-};
-
-class RBTestBlocking : public ::testing::Test {
-     protected:
-    Finn::RingBuffer<int, true> rb = Finn::RingBuffer<int, true>(parts, elementsPerPart);
-    Finn::vector<int> data;
-    std::vector<Finn::vector<int>> storedDatas;
-    FinnUtils::BufferFiller filler = FinnUtils::BufferFiller(0, 255);
-    void SetUp() override { data.resize(rb.size(SIZE_SPECIFIER::FEATUREMAP_SIZE)); }
-
-    /**
-     * @brief Utility function to completely fill a ringBuffer or a deviceinput/output buffer.
-     *
-     * This function uses the data vector to fill the entire rb of type T with random data, based on it's size.
-     * storedDatas gets all data used pushed back.
-     *
-     * @param fast Whether to use fast store methods (no mutex locking, no length checks)
-     * @param ref Whether to use references (true) or iterators (false)
-     */
-    void fillCompletely(bool ref) {
-        for (size_t i = 0; i < rb.size(SIZE_SPECIFIER::BATCHSIZE); i++) {
-            filler.fillRandom(data.begin(), data.end());
-            storedDatas.push_back(data);
-
-            if (ref) {
-                EXPECT_TRUE(rb.store(data.begin(), data.size()));
-            } else {
-                EXPECT_TRUE(rb.store(data.begin(), data.end()));
-            }
-        }
-    }
-
-    void TearDown() override {}
-};
-
-
-TEST(RBTestManual, RBInitTest) {
-    auto rb = RB(parts, elementsPerPart);
-
-    // Pointers
-    EXPECT_TRUE(rb.empty());
-
-    // Sizes
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::BATCHSIZE), parts);
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::FEATUREMAP_SIZE), elementsPerPart);
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::BYTES), parts * elementsPerPart * sizeof(int));
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::TOTAL_DATA_SIZE), parts * elementsPerPart);
-
-    // Initial values
-    std::vector<int> out;
-    rb.readAllValidParts(std::back_inserter(out));
-    EXPECT_TRUE(out.empty());
-    EXPECT_FALSE(rb.full());
-}
-
-TEST_F(RBTest, RBStoreReadTestIterator) {
-    fillCompletely(false);
-
-    // Temporary save entries
-    std::vector<int> current;
-    rb.readWithoutInvalidation(std::back_inserter(current));
-
-    // Confirm that no new data can be stored until some data is read
-    filler.fillRandom(data.begin(), data.end());
-    EXPECT_FALSE(rb.store(data.begin(), data.end()));
-
-    // Test that the valid data was not changed
-    std::vector<int> after;
-    rb.readWithoutInvalidation(std::back_inserter(after));
-    EXPECT_EQ(after, current);
-
-    // Read two entries
-    std::size_t oldSize = rb.size();
-    int* buf = new int[elementsPerPart];
-    EXPECT_TRUE(rb.read(buf));
-    EXPECT_TRUE(rb.read(buf));
-
-    // Check size
-    EXPECT_EQ(rb.size(), oldSize - 2);
-    delete[] buf;
-}
-
-TEST_F(RBTestBlocking, RBFastStoreTestIterator) {
-    fillCompletely(true);
-
-    // Read two entries
-    std::size_t oldSize = rb.size();
-    int* buf = new int[elementsPerPart];
-    EXPECT_TRUE(rb.read(buf));
-    EXPECT_TRUE(rb.read(buf));
-
-    // Check size
-    EXPECT_EQ(rb.size(), oldSize - 2);
-    delete[] buf;
-}
-
-TEST_F(RBTest, RBStoreReadTestReference) {
-    fillCompletely(true);
-
-    // Temporary save entries
-    std::vector<int> current;
-    rb.readWithoutInvalidation(std::back_inserter(current));
-
-    // Confirm that no new data can be stored until some data is read
-    filler.fillRandom(data.begin(), data.end());
-    EXPECT_FALSE(rb.store(data.begin(), data.end()));
-
-    // Test that the valid data was not changed
-    std::vector<int> after;
-    rb.readWithoutInvalidation(std::back_inserter(after));
-    EXPECT_EQ(after, current);
-
-    // Read two entries
-    std::size_t oldSize = rb.size();
-    int* buf = new int[elementsPerPart];
-    EXPECT_TRUE(rb.read(buf));
-    EXPECT_TRUE(rb.read(buf));
-
-    // Check size
-    EXPECT_EQ(rb.size(), oldSize - 2);
-    delete[] buf;
-}
-
-TEST_F(RBTest, RBReadTest) {
-    //* Requires that the store tests ran successfully to be successfull itself
-    fillCompletely(true);
-
-    // Check that the read data is equivalent to the saved data and read in the same order (important!)
-    for (unsigned int i = 0; i < rb.size(SIZE_SPECIFIER::BATCHSIZE); i++) {
-        EXPECT_TRUE(rb.read(data.begin()));
-        EXPECT_EQ(storedDatas[i], data);
-    }
-}
-
-TEST_F(RBTest, RBReadTestArray) {
-    //* Requires that the store tests ran successfully to be successfull itself
-    fillCompletely(true);
-
-    // Check that the read data is equivalent to the saved data and read in the same order (important!)
-    int* buf = new int[rb.size(SIZE_SPECIFIER::FEATUREMAP_SIZE)];
-    for (unsigned int i = 0; i < rb.size(SIZE_SPECIFIER::BATCHSIZE); i++) {
-        EXPECT_TRUE(rb.read(buf));
-        for (unsigned int j = 0; j < rb.size(SIZE_SPECIFIER::FEATUREMAP_SIZE); j++) {
-            EXPECT_EQ(storedDatas[i][j], buf[j]);
-        }
-        break;
-    }
-    delete[] buf;
-}
-
-TEST_F(RBTest, RBUtilFuncsTest) {
-    // Check all sizes
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::BATCHSIZE), parts);
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::FEATUREMAP_SIZE), elementsPerPart);
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::BYTES), elementsPerPart * sizeof(int) * parts);
-    EXPECT_EQ(rb.size(SIZE_SPECIFIER::TOTAL_DATA_SIZE), elementsPerPart * parts);
-
-    // Check validity flags
-    fillCompletely(true);
-    EXPECT_TRUE(rb.full());
-    EXPECT_EQ(rb.size(), rb.size(SIZE_SPECIFIER::BATCHSIZE));
-
-    EXPECT_TRUE(rb.read(data.begin()));
-    EXPECT_FALSE(rb.full());
-    EXPECT_EQ(rb.size(), rb.size(SIZE_SPECIFIER::BATCHSIZE) - 1);
-}
-
-int main(int argc, char** argv) {
-    ::testing::InitGoogleTest(&argc, argv);
-    return RUN_ALL_TESTS();
-}
\ No newline at end of file
diff --git a/unittests/core/UnittestConfig.h b/unittests/core/UnittestConfig.h
index ed70428..ac8d6a7 100644
--- a/unittests/core/UnittestConfig.h
+++ b/unittests/core/UnittestConfig.h
@@ -12,10 +12,10 @@
 
 #include <FINNCppDriver/utils/ConfigurationStructs.h>
 #include <FINNCppDriver/utils/FinnUtils.h>
-#include <FINNCppDriver/utils/Logger.h>
 #include <FINNCppDriver/utils/Types.h>
 
 #include <FINNCppDriver/utils/FinnDatatypes.hpp>
+#include <FINNCppDriver/utils/Logger.hpp>
 #include <array>
 #include <filesystem>
 #include <memory>
diff --git a/unittests/utils/CMakeLists.txt b/unittests/utils/CMakeLists.txt
index 69dafa3..13277b2 100644
--- a/unittests/utils/CMakeLists.txt
+++ b/unittests/utils/CMakeLists.txt
@@ -3,3 +3,4 @@ add_unittest(DataPackingTest.cpp)
 add_unittest(CustomDynamicBitsetTest.cpp)
 add_unittest(DynamicMdSpanTest.cpp)
 add_unittest(DataFoldingTest.cpp)
+add_unittest(SPSCQueueTest.cpp)
diff --git a/unittests/utils/SPSCQueueTest.cpp b/unittests/utils/SPSCQueueTest.cpp
new file mode 100644
index 0000000..2844d44
--- /dev/null
+++ b/unittests/utils/SPSCQueueTest.cpp
@@ -0,0 +1,571 @@
+#include <FINNCppDriver/utils/SPSCQueue.hpp>
+#include <atomic>
+#include <chrono>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+// Basic tests for non-trivial type
+TEST(SPSCQueueTest, BasicOperations) {
+    SPSCQueue<std::string, 16> queue;
+
+    // Test empty state
+    EXPECT_TRUE(queue.is_empty());
+    EXPECT_FALSE(queue.is_full());
+    EXPECT_EQ(queue.size(), 0);
+    EXPECT_EQ(queue.capacity(), 15);  // One slot always kept empty
+
+    // Test actual capacity vs requested
+    EXPECT_EQ(queue.requested_capacity(), 16);
+    EXPECT_EQ(queue.actual_capacity(), 15);
+
+    // Test enqueue and size
+    EXPECT_TRUE(queue.try_enqueue("test1"));
+    EXPECT_FALSE(queue.is_empty());
+    EXPECT_EQ(queue.size(), 1);
+
+    // Test dequeue
+    std::string item;
+    EXPECT_TRUE(queue.try_dequeue(item));
+    EXPECT_EQ(item, "test1");
+    EXPECT_TRUE(queue.is_empty());
+
+    // Test multiple items
+    EXPECT_TRUE(queue.try_enqueue("test2"));
+    EXPECT_TRUE(queue.try_enqueue("test3"));
+    EXPECT_EQ(queue.size(), 2);
+
+    EXPECT_TRUE(queue.try_dequeue(item));
+    EXPECT_EQ(item, "test2");
+    EXPECT_TRUE(queue.try_dequeue(item));
+    EXPECT_EQ(item, "test3");
+    EXPECT_TRUE(queue.is_empty());
+}
+
+// Test for trivially copyable type (using the specialization)
+TEST(SPSCQueueTest, TrivialTypeOperations) {
+    SPSCQueue<int, 16> queue;
+
+    EXPECT_TRUE(queue.try_enqueue(42));
+    EXPECT_EQ(queue.size(), 1);
+
+    int item = 0;
+    EXPECT_TRUE(queue.try_dequeue(item));
+    EXPECT_EQ(item, 42);
+    EXPECT_TRUE(queue.is_empty());
+}
+
+// Test queue capacity and power-of-2 rounding
+TEST(SPSCQueueTest, CapacityRounding) {
+    // Test with non-power-of-2 capacity
+    SPSCQueue<int, 10> queue1;
+    EXPECT_EQ(queue1.requested_capacity(), 10);
+    EXPECT_EQ(queue1.actual_capacity(), 15);  // Rounded up to 16-1
+
+    // Test with power-of-2 capacity
+    SPSCQueue<int, 16> queue2;
+    EXPECT_EQ(queue2.requested_capacity(), 16);
+    EXPECT_EQ(queue2.actual_capacity(), 15);  // 16-1
+}
+
+// Test filling the queue to capacity
+TEST(SPSCQueueTest, FullQueue) {
+    SPSCQueue<int, 4> queue;  // Actual capacity: 3
+
+    EXPECT_TRUE(queue.try_enqueue(1));
+    EXPECT_TRUE(queue.try_enqueue(2));
+    EXPECT_TRUE(queue.try_enqueue(3));
+    EXPECT_TRUE(queue.is_full());        // Should be full now
+    EXPECT_FALSE(queue.try_enqueue(4));  // Should fail
+
+    int item;
+    EXPECT_TRUE(queue.try_dequeue(item));
+    EXPECT_EQ(item, 1);
+    EXPECT_FALSE(queue.is_full());  // No longer full
+
+    // Can enqueue again
+    EXPECT_TRUE(queue.try_enqueue(4));
+}
+
+// Test wrap-around behavior
+TEST(SPSCQueueTest, WrapAround) {
+    SPSCQueue<int, 4> queue;  // Actual capacity: 3
+    std::vector<int> results;
+
+    // Fill and drain multiple times to force wrap-around
+    for (int cycle = 0; cycle < 3; cycle++) {
+        EXPECT_TRUE(queue.try_enqueue(cycle * 3 + 1));
+        EXPECT_TRUE(queue.try_enqueue(cycle * 3 + 2));
+        EXPECT_TRUE(queue.try_enqueue(cycle * 3 + 3));
+
+        int item;
+        EXPECT_TRUE(queue.try_dequeue(item));
+        results.push_back(item);
+        EXPECT_TRUE(queue.try_dequeue(item));
+        results.push_back(item);
+        EXPECT_TRUE(queue.try_dequeue(item));
+        results.push_back(item);
+    }
+
+    // Verify correct order
+    std::vector<int> expected = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    EXPECT_EQ(results, expected);
+}
+
+// Test move semantics
+TEST(SPSCQueueTest, MoveSemantics) {
+    SPSCQueue<std::unique_ptr<int>, 4> queue;
+
+    auto ptr1 = std::make_unique<int>(42);
+    auto ptr2 = std::make_unique<int>(43);
+
+    EXPECT_TRUE(queue.try_enqueue(std::move(ptr1)));
+    EXPECT_TRUE(queue.try_enqueue(std::move(ptr2)));
+
+    // Original pointers should be null after move
+    EXPECT_EQ(ptr1, nullptr);
+    EXPECT_EQ(ptr2, nullptr);
+
+    std::unique_ptr<int> result;
+    EXPECT_TRUE(queue.try_dequeue(result));
+    EXPECT_EQ(*result, 42);
+
+    EXPECT_TRUE(queue.try_dequeue(result));
+    EXPECT_EQ(*result, 43);
+}
+
+// Test emplace functionality
+TEST(SPSCQueueTest, Emplace) {
+    SPSCQueue<std::pair<int, std::string>, 4> queue;
+
+    EXPECT_TRUE(queue.try_emplace(1, "one"));
+    EXPECT_TRUE(queue.try_emplace(2, "two"));
+
+    std::pair<int, std::string> item;
+    EXPECT_TRUE(queue.try_dequeue(item));
+    EXPECT_EQ(item.first, 1);
+    EXPECT_EQ(item.second, "one");
+
+    EXPECT_TRUE(queue.try_dequeue(item));
+    EXPECT_EQ(item.first, 2);
+    EXPECT_EQ(item.second, "two");
+}
+
+// Test bulk operations
+TEST(SPSCQueueTest, BulkDequeue) {
+    SPSCQueue<int, 16> queue;
+
+    // Enqueue several items
+    for (int i = 0; i < 10; i++) {
+        EXPECT_TRUE(queue.try_enqueue(i));
+    }
+
+    // Dequeue in bulk
+    std::vector<int> results(5);
+    size_t count = queue.try_dequeue_bulk(results.begin(), 5);
+
+    EXPECT_EQ(count, 5);
+    for (size_t i = 0; i < count; i++) {
+        EXPECT_EQ(results[i], static_cast<int>(i));
+    }
+
+    // Dequeue remaining items
+    count = queue.try_dequeue_bulk(results.begin(), 5);
+    EXPECT_EQ(count, 5);
+    for (size_t i = 0; i < count; i++) {
+        EXPECT_EQ(results[i], static_cast<int>(i + 5));
+    }
+
+    // Queue should be empty now
+    EXPECT_TRUE(queue.is_empty());
+}
+
+// Test blocking behavior with threads
+TEST(SPSCQueueTest, BlockingOperations) {
+    SPSCQueue<int, 4> queue;  // Actual capacity: 3
+    std::atomic<bool> producer_done{false};
+    std::atomic<bool> consumer_done{false};
+    std::vector<int> produced;
+    std::vector<int> consumed;
+
+    // Producer thread - will produce 10 items
+    std::thread producer([&queue, &producer_done, &produced]() {
+        for (int i = 0; i < 10; i++) {
+            produced.push_back(i);  // No mutex needed - only producer thread touches this
+            queue.enqueue(i);       // Blocking enqueue
+        }
+        producer_done.store(true, std::memory_order_release);
+    });
+
+    // Consumer thread - will consume all items
+    std::thread consumer([&queue, &producer_done, &consumer_done, &consumed]() {
+        while (true) {
+            int item;
+            // Use a timeout to avoid hanging indefinitely
+            if (queue.dequeue_for(item, std::chrono::milliseconds(100))) {
+                consumed.push_back(item);  // No mutex needed - only consumer thread touches this
+            } else {
+                // Check if we're done - if producer is done AND queue is empty
+                if (producer_done.load(std::memory_order_acquire) && queue.is_empty()) {
+                    break;
+                }
+                // If we timed out but aren't done, just try again
+            }
+        }
+        consumer_done.store(true, std::memory_order_release);
+    });
+
+    // Set a timeout for the entire test
+    auto start_time = std::chrono::steady_clock::now();
+    auto timeout = std::chrono::seconds(5);  // 5 second timeout should be more than enough
+
+    while (!consumer_done.load(std::memory_order_acquire)) {
+        if (std::chrono::steady_clock::now() - start_time > timeout) {
+            // Test is taking too long, likely deadlocked - force shutdown
+            queue.shutdown();
+            break;
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    }
+
+    producer.join();
+    consumer.join();
+
+    // Verify all items were produced and consumed in order
+    EXPECT_EQ(produced, consumed);
+    EXPECT_EQ(consumed.size(), 10);
+}
+
+// Test timed operations
+TEST(SPSCQueueTest, TimedOperations) {
+    SPSCQueue<int, 4> queue;
+
+    // Test timeout on empty queue
+    int item = -1;
+    auto start = std::chrono::steady_clock::now();
+    bool result = queue.dequeue_for(item, std::chrono::milliseconds(100));
+    auto end = std::chrono::steady_clock::now();
+
+    EXPECT_FALSE(result);
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    EXPECT_GE(duration, 90);  // Allow for some timing variation
+
+    // Test successful timed dequeue
+    queue.try_enqueue(42);
+    result = queue.dequeue_for(item, std::chrono::milliseconds(100));
+    EXPECT_TRUE(result);
+    EXPECT_EQ(item, 42);
+}
+
+// Test bulk timed operations
+TEST(SPSCQueueTest, BulkTimedOperations) {
+    SPSCQueue<int, 16> queue;
+
+    // Test timeout on empty queue
+    std::vector<int> results(5);
+    auto start = std::chrono::steady_clock::now();
+    size_t count = queue.dequeue_bulk_for(results.begin(), 5, std::chrono::milliseconds(100));
+    auto end = std::chrono::steady_clock::now();
+
+    EXPECT_EQ(count, 0);
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    EXPECT_GE(duration, 90);  // Allow for some timing variation
+
+    // Test with some items
+    for (int i = 0; i < 3; i++) {
+        queue.try_enqueue(i);
+    }
+
+    count = queue.dequeue_bulk_for(results.begin(), 5, std::chrono::milliseconds(100));
+    EXPECT_EQ(count, 3);
+    for (size_t i = 0; i < count; i++) {
+        EXPECT_EQ(results[i], static_cast<int>(i));
+    }
+}
+
+// Test dequeue_bulk_for_any
+TEST(SPSCQueueTest, BulkTimedAnyOperations) {
+    SPSCQueue<int, 16> queue;
+
+    // Test timeout on empty queue
+    std::vector<int> results(5);
+    auto start = std::chrono::steady_clock::now();
+    size_t count = queue.dequeue_bulk_for_any(results.begin(), 5, std::chrono::milliseconds(100));
+    auto end = std::chrono::steady_clock::now();
+
+    EXPECT_EQ(count, 0);
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    EXPECT_GE(duration, 90);  // Allow for some timing variation
+
+    // Test with some items, added gradually
+    std::thread producer([&queue]() {
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+        queue.try_enqueue(42);
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+        queue.try_enqueue(43);
+        queue.try_enqueue(44);
+    });
+
+    results.assign(5, 0);
+    count = queue.dequeue_bulk_for_any(results.begin(), 5, std::chrono::milliseconds(200));
+
+    EXPECT_GE(count, 1);  // Should get at least the first item
+    EXPECT_EQ(results[0], 42);
+
+    if (count > 1) {
+        EXPECT_EQ(results[1], 43);
+    }
+
+    producer.join();
+
+    // Cleanup any remaining items
+    queue.try_dequeue_bulk(results.begin(), 5);
+}
+
+// Test shutdown behavior
+TEST(SPSCQueueTest, Shutdown) {
+    SPSCQueue<int, 4> queue;
+    std::atomic<bool> consumer_unblocked{false};
+
+    // Start a consumer thread that will block
+    std::thread consumer([&queue, &consumer_unblocked]() {
+        int item;
+        bool result = queue.dequeue(item);  // This should block
+        EXPECT_FALSE(result);               // After shutdown, should return false
+        consumer_unblocked = true;
+    });
+
+    // Give the consumer time to block
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+    // Shutdown the queue
+    queue.shutdown();
+
+    // Consumer should unblock
+    consumer.join();
+    EXPECT_TRUE(consumer_unblocked);
+
+    // After shutdown, operations should fail
+    int item;
+    EXPECT_FALSE(queue.dequeue(item));
+    EXPECT_FALSE(queue.dequeue_for(item, std::chrono::milliseconds(1)));
+}
+
+// Test concurrent enqueue/dequeue with high throughput
+TEST(SPSCQueueTest, ConcurrentThroughput) {
+    for (size_t runs = 0; runs < 1000; ++runs) {
+        constexpr size_t ITEM_COUNT = 1000000;
+        SPSCQueue<uint64_t, 1024> queue;
+        std::atomic<bool> error{false};
+        std::atomic<bool> producer_done{false};
+
+        std::thread producer([&queue, &error, &producer_done]() {
+            try {
+                for (uint64_t i = 0; i < ITEM_COUNT; i++) {
+                    // Use non-blocking enqueue with retry
+                    while (!queue.try_enqueue(i) && !error.load(std::memory_order_relaxed)) {
+                        std::this_thread::yield();  // Give consumer time to catch up
+                    }
+
+                    // Exit early if consumer detected an error
+                    if (error.load(std::memory_order_relaxed)) {
+                        break;
+                    }
+                }
+            } catch (...) { error = true; }
+            producer_done.store(true, std::memory_order_release);
+        });
+
+        std::thread consumer([&queue, &error, &producer_done]() {
+            try {
+                uint64_t expected = 0;
+                while (expected < ITEM_COUNT && !error.load(std::memory_order_relaxed)) {
+                    uint64_t item;
+
+                    // Use timeout-based dequeue or non-blocking with yield
+                    if (queue.try_dequeue(item)) {
+                        if (item != expected) {
+                            error = true;
+                            break;
+                        }
+                        expected++;
+                    } else if (producer_done.load(std::memory_order_acquire)) {
+                        // If producer is done and no more items, we're missing items
+                        if (expected < ITEM_COUNT) {
+                            error = true;
+                        }
+                        break;
+                    } else {
+                        std::this_thread::yield();  // Give producer time to produce
+                    }
+                }
+            } catch (...) { error = true; }
+        });
+
+        // Set timeout for test to avoid hanging forever
+        auto start_time = std::chrono::steady_clock::now();
+        bool joined = false;
+
+        // Try to join with timeout
+        while (!joined && std::chrono::steady_clock::now() - start_time < std::chrono::seconds(30)) {
+            producer.join();
+            consumer.join();
+            joined = true;
+        }
+
+        // If not joined within timeout, consider it a deadlock
+        EXPECT_TRUE(joined) << "Test timed out - likely deadlock";
+        EXPECT_FALSE(error) << "Data corruption or missing items detected";
+    }
+}
+
+// Test bulk enqueue operations
+TEST(SPSCQueueTest, BulkEnqueue) {
+    SPSCQueue<int, 16> queue;  // Actual capacity: 15
+
+    // Test basic bulk enqueue
+    std::vector<int> items1 = {1, 2, 3, 4, 5};
+    size_t enqueued = queue.try_enqueue_bulk(items1.begin(), items1.size());
+
+    EXPECT_EQ(enqueued, 5);
+    EXPECT_EQ(queue.size(), 5);
+
+    // Test partial enqueue (not enough space)
+    std::vector<int> items2(12, 42);  // 12 items with value 42
+    enqueued = queue.try_enqueue_bulk(items2.begin(), items2.size());
+
+    EXPECT_EQ(enqueued, 10);  // Only 10 more should fit (15 - 5 already in queue)
+    EXPECT_EQ(queue.size(), 15);
+    EXPECT_TRUE(queue.is_full());
+
+    // Dequeue and verify values
+    std::vector<int> results(15);
+    size_t dequeued = queue.try_dequeue_bulk(results.begin(), 15);
+
+    EXPECT_EQ(dequeued, 15);
+    EXPECT_EQ(results[0], 1);  // First batch
+    EXPECT_EQ(results[4], 5);
+    EXPECT_EQ(results[5], 42);  // Second batch
+    EXPECT_EQ(results[14], 42);
+
+    // Test enqueue with empty queue
+    std::vector<int> items3 = {10, 20, 30};
+    enqueued = queue.try_enqueue_bulk(items3.begin(), items3.size());
+
+    EXPECT_EQ(enqueued, 3);
+
+    // Test wrap-around behavior
+    queue.try_dequeue_bulk(results.begin(), 1);  // Remove one item
+
+    std::vector<int> items4(14, 99);  // Try to add 14 items
+    enqueued = queue.try_enqueue_bulk(items4.begin(), items4.size());
+
+    EXPECT_EQ(enqueued, 13);  // Should fit 13 more (capacity 15 - 2 already there)
+
+    // Dequeue all and verify
+    dequeued = queue.try_dequeue_bulk(results.begin(), 15);
+
+    EXPECT_EQ(dequeued, 15);
+    EXPECT_EQ(results[0], 20);
+    EXPECT_EQ(results[1], 30);
+    EXPECT_EQ(results[2], 99);
+    EXPECT_EQ(results[14], 99);
+}
+
+// Test blocking bulk enqueue
+TEST(SPSCQueueTest, BlockingBulkEnqueue) {
+    SPSCQueue<int, 8> queue;  // Actual capacity: 7
+    std::atomic<bool> producer_done{false};
+    std::atomic<bool> consumer_done{false};
+    std::vector<int> all_produced;
+    std::vector<int> all_consumed;
+
+    // Producer thread - will produce 20 items in batches
+    std::thread producer([&queue, &producer_done, &all_produced]() {
+        std::vector<int> batch1 = {1, 2, 3, 4, 5};
+        std::vector<int> batch2 = {6, 7, 8, 9, 10};
+        std::vector<int> batch3 = {11, 12, 13, 14, 15};
+        std::vector<int> batch4 = {16, 17, 18, 19, 20};
+
+        // Add all items to the produced vector
+        all_produced.insert(all_produced.end(), batch1.begin(), batch1.end());
+        all_produced.insert(all_produced.end(), batch2.begin(), batch2.end());
+        all_produced.insert(all_produced.end(), batch3.begin(), batch3.end());
+        all_produced.insert(all_produced.end(), batch4.begin(), batch4.end());
+
+        // Enqueue batches with blocking behavior
+        queue.enqueue_bulk(batch1.begin(), batch1.size());
+        queue.enqueue_bulk(batch2.begin(), batch2.size());
+        queue.enqueue_bulk(batch3.begin(), batch3.size());
+        queue.enqueue_bulk(batch4.begin(), batch4.size());
+
+        producer_done = true;
+    });
+
+    // Consumer thread - will consume all items
+    std::thread consumer([&queue, &producer_done, &consumer_done, &all_consumed]() {
+        std::vector<int> results(3);  // Small buffer to force multiple dequeues
+
+        while (!producer_done || !queue.is_empty()) {
+            size_t dequeued = queue.try_dequeue_bulk(results.begin(), results.size());
+            if (dequeued > 0) {
+                all_consumed.insert(all_consumed.end(), results.begin(), results.begin() + dequeued);
+            } else {
+                std::this_thread::yield();  // Give producer time to produce
+            }
+        }
+
+        consumer_done = true;
+    });
+
+    producer.join();
+    consumer.join();
+
+    // Verify all items were produced and consumed in order
+    EXPECT_EQ(all_produced, all_consumed);
+    EXPECT_EQ(all_consumed.size(), 20);
+}
+
+// Test timed bulk enqueue operations
+TEST(SPSCQueueTest, TimedBulkEnqueue) {
+    SPSCQueue<int, 4> queue;  // Actual capacity: 3
+
+    // Fill the queue
+    queue.try_enqueue(1);
+    queue.try_enqueue(2);
+    queue.try_enqueue(3);
+    EXPECT_TRUE(queue.is_full());
+
+    // Test timeout on full queue
+    std::vector<int> items = {4, 5, 6};
+    auto start = std::chrono::steady_clock::now();
+    size_t enqueued = queue.enqueue_bulk_for(items.begin(), items.size(), std::chrono::milliseconds(100));
+    auto end = std::chrono::steady_clock::now();
+
+    EXPECT_EQ(enqueued, 0);  // Should time out without enqueuing
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    EXPECT_GE(duration, 90);  // Allow for some timing variation
+
+    // Test successful timed enqueue after making space
+    int item;
+    queue.try_dequeue(item);  // Make space for one item
+    EXPECT_EQ(item, 1);
+
+    enqueued = queue.enqueue_bulk_for(items.begin(), items.size(), std::chrono::milliseconds(100));
+    EXPECT_EQ(enqueued, 1);  // Should enqueue one item
+
+    // Verify queue state
+    std::vector<int> results(3);
+    size_t dequeued = queue.try_dequeue_bulk(results.begin(), 3);
+    EXPECT_EQ(dequeued, 3);
+    EXPECT_EQ(results[0], 2);
+    EXPECT_EQ(results[1], 3);
+    EXPECT_EQ(results[2], 4);  // The first item from the timed bulk enqueue
+}
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/unittests/xrtMock/experimental/xrt_ip.h b/unittests/xrtMock/experimental/xrt_ip.h
index 5c58f04..09b0456 100644
--- a/unittests/xrtMock/experimental/xrt_ip.h
+++ b/unittests/xrtMock/experimental/xrt_ip.h
@@ -60,8 +60,8 @@ namespace xrt {
          * Constructor throws on error.
          */
         ip(const xrt::device& device, const xrt::uuid& xclbin_id, const std::string& name) {
-            FINN_LOG(Logger::getLogger(), loglevel::debug) << "[xrt::ip mock]"
-                                                           << "Create kernel with name: " << name;
+            FINN_LOG(loglevel::debug) << "[xrt::ip mock]"
+                                      << "Create kernel with name: " << name;
             kernel_device.emplace_back(device);
             kernel_uuid.emplace_back(xclbin_id);
             kernel_name.emplace_back(name);
diff --git a/unittests/xrtMock/xrt/xrt_bo.cpp b/unittests/xrtMock/xrt/xrt_bo.cpp
index 0eaebdf..f6fec88 100644
--- a/unittests/xrtMock/xrt/xrt_bo.cpp
+++ b/unittests/xrtMock/xrt/xrt_bo.cpp
@@ -4,17 +4,17 @@
 #include "xrt_device.h"
 
 void xrt::bo::sync(xclBOSyncDirection syncMode) {
-    // FINN_LOG(logger, loglevel::debug) << "(xrtMock) xrt::bo object synced!\n";
+    // FINN_LOG(loglevel::debug) << "(xrtMock) xrt::bo object synced!\n";
 }
 
 void xrt::bo::sync(xclBOSyncDirection dir, size_t sz, size_t offset) {
-    // FINN_LOG(logger, loglevel::debug) << "(xrtMock) xrt::bo object synced!\n";
+    // FINN_LOG(loglevel::debug) << "(xrtMock) xrt::bo object synced!\n";
 }
 
 /**
  * @brief Destroy the xrt::bo object and free the memory map
  */
 xrt::bo::~bo() {
-    FINN_LOG(logger, loglevel::debug) << "(xrtMock) Destroying and freeing xrt::bo object!\n";
+    FINN_LOG(loglevel::debug) << "(xrtMock) Destroying and freeing xrt::bo object!\n";
     free(memmap);
 }
\ No newline at end of file
diff --git a/unittests/xrtMock/xrt/xrt_bo.h b/unittests/xrtMock/xrt/xrt_bo.h
index 8abd4d9..ea2b141 100644
--- a/unittests/xrtMock/xrt/xrt_bo.h
+++ b/unittests/xrtMock/xrt/xrt_bo.h
@@ -1,7 +1,7 @@
 #ifndef XRT_BO_H
 #define XRT_BO_H
 
-#include <FINNCppDriver/utils/Logger.h>
+#include <FINNCppDriver/utils/Logger.hpp>
 
 #include "../xrt.h"
 #include "xrt_device.h"
@@ -15,8 +15,6 @@ namespace xrt {
 
         void* memmap = nullptr;
 
-        logger_type& logger;
-
          public:
         /**
          * XCL BO Flags bits layout
@@ -61,12 +59,10 @@ namespace xrt {
             p2p = XRT_BO_FLAGS_P2P,
             svm = XRT_BO_FLAGS_SVM,
         };
-        bo(xrt::device pDevice, size_t pBytesize, unsigned int pGroup) : device(pDevice), byteSize(pBytesize), group(pGroup), logger(Logger::getLogger()) { FINN_LOG(logger, loglevel::debug) << "(xrtMock) xrt::bo object created!\n"; }
-        bo(const xrt::device& pDevice, size_t pBytesize, bo::flags flags, uint32_t pGroup) : device(pDevice), byteSize(pBytesize), group(pGroup), logger(Logger::getLogger()) {
-            FINN_LOG(logger, loglevel::debug) << "(xrtMock) xrt::bo object created with flag!\n";
-        }
+        bo(xrt::device pDevice, size_t pBytesize, unsigned int pGroup) : device(pDevice), byteSize(pBytesize), group(pGroup) { FINN_LOG(loglevel::debug) << "(xrtMock) xrt::bo object created!\n"; }
+        bo(const xrt::device& pDevice, size_t pBytesize, bo::flags flags, uint32_t pGroup) : device(pDevice), byteSize(pBytesize), group(pGroup) { FINN_LOG(loglevel::debug) << "(xrtMock) xrt::bo object created with flag!\n"; }
 
-        bo(bo&& other) noexcept : device(std::move(other.device)), byteSize(other.byteSize), group(other.group), memmap(nullptr), logger(Logger::getLogger()) { std::swap(memmap, other.memmap); }
+        bo(bo&& other) noexcept : device(std::move(other.device)), byteSize(other.byteSize), group(other.group), memmap(nullptr) { std::swap(memmap, other.memmap); }
 
         void sync(xclBOSyncDirection);
         void sync(xclBOSyncDirection dir, size_t sz, size_t offset);
@@ -81,7 +77,7 @@ namespace xrt {
          */
         template<typename T>
         T map() {
-            FINN_LOG(logger, loglevel::debug) << "(xrtMock) Map created from xrt::bo with byte size " << byteSize << "!\n";
+            FINN_LOG(loglevel::debug) << "(xrtMock) Map created from xrt::bo with byte size " << byteSize << "!\n";
             T createdMap = static_cast<T>(malloc(byteSize));
             memmap = createdMap;
             return createdMap;
diff --git a/unittests/xrtMock/xrt/xrt_kernel.cpp b/unittests/xrtMock/xrt/xrt_kernel.cpp
index 0fa3a74..5025bd0 100644
--- a/unittests/xrtMock/xrt/xrt_kernel.cpp
+++ b/unittests/xrtMock/xrt/xrt_kernel.cpp
@@ -1,15 +1,14 @@
 #include "xrt_kernel.h"
 
-#include <FINNCppDriver/utils/Logger.h>
-
+#include <FINNCppDriver/utils/Logger.hpp>
 #include <iostream>
 
 #include "../ert.h"
 
 namespace xrt {
     kernel::kernel(const xrt::device& device, const xrt::uuid& xclbin_id, const std::string& name, cu_access_mode mode) {
-        FINN_LOG(Logger::getLogger(), loglevel::debug) << "[xrt::kernel mock]"
-                                                       << "Create kernel with name: " << name;
+        FINN_LOG(loglevel::debug) << "[xrt::kernel mock]"
+                                  << "Create kernel with name: " << name;
         kernel_device.emplace_back(device);
         kernel_uuid.emplace_back(xclbin_id);
         kernel_name.emplace_back(name);