diff --git a/util/threading.h b/util/threading.h
index ec2a5758..5d952f4e 100644
--- a/util/threading.h
+++ b/util/threading.h
@@ -120,7 +120,8 @@ class PerClusterPools {
   // result in threads not running on their own core, we only allow for
   // *upper bounds* on the number of clusters and threads. The actual number of
   // clusters and threads are still limited by the detected topology.
-  PerClusterPools(size_t max_clusters, size_t max_threads)
+  PerClusterPools(size_t max_clusters, size_t max_threads,
+                  size_t pin_offset = 0)
       : have_threading_support_(hwy::HaveThreadingSupport()),
         cores_per_cluster_(DetectCoresPerCluster()),
         outer_pool_(CapIfNonzero(cores_per_cluster_.size(), max_clusters)) {
@@ -135,8 +136,11 @@ class PerClusterPools {
       inner_pools_.push_back(std::make_unique<hwy::ThreadPool>(num_threads));
       if (num_threads > 1) {
         inner_pools_.back()->Run(0, num_threads,
-                                 [](uint64_t /*task*/, size_t thread) {
-                                   hwy::PinThreadToLogicalProcessor(thread);
+                                 [pin_offset, num_threads](uint64_t /*task*/,
+                                                           size_t thread) {
+                                   auto lp =
+                                       (thread + pin_offset) % num_threads;
+                                   hwy::PinThreadToLogicalProcessor(lp);
                                  });
       }
       return;
@@ -153,7 +157,7 @@ class PerClusterPools {
     // (the one calling inner.Run()) to the enabled cores in the cluster.
     outer_pool_.Run(
         0, outer_pool_.NumWorkers(),
-        [this](uint64_t outer, size_t outer_thread) {
+        [this, pin_offset](uint64_t outer, size_t outer_thread) {
           HWY_ASSERT(outer == outer_thread);  // each outer has one task
           hwy::ThreadPool& inner = *inner_pools_[outer];

@@ -163,9 +167,10 @@

           HWY_ASSERT(inner.NumWorkers() <= cores.size());
           inner.Run(0, inner.NumWorkers(),
-                    [&cores](uint64_t task, size_t thread) {
+                    [pin_offset, &cores](uint64_t task, size_t thread) {
                       HWY_ASSERT(task == thread);  // each inner has one task
-                      hwy::PinThreadToLogicalProcessor(cores[task]);
+                      auto lp = cores[(task + pin_offset) % cores.size()];
+                      hwy::PinThreadToLogicalProcessor(lp);
                     });
         });
   }
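
Note for reviewers: below is a minimal standalone sketch of the new pinning arithmetic, not part of the patch. The offset, thread count, and core IDs are hypothetical; the two loops mirror the fallback path ((thread + pin_offset) % num_threads) and the per-cluster path (cores[(task + pin_offset) % cores.size()]) added above.

// Sketch only: reproduces the pin_offset rotation from both constructor paths.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const size_t pin_offset = 1;  // hypothetical offset

  // Fallback path (topology detection failed): rotate within [0, num_threads).
  const size_t num_threads = 4;  // hypothetical
  for (size_t thread = 0; thread < num_threads; ++thread) {
    printf("thread %zu -> LP %zu\n", thread,
           (thread + pin_offset) % num_threads);
  }

  // Per-cluster path: rotate within the cluster's detected core list.
  const std::vector<size_t> cores = {8, 9, 10, 11};  // hypothetical cluster
  for (size_t task = 0; task < cores.size(); ++task) {
    printf("task %zu -> core %zu\n", task,
           cores[(task + pin_offset) % cores.size()]);
  }
  return 0;
}

With pin_offset = 1 this prints thread 0 -> LP 1 through thread 3 -> LP 0, and task 0 -> core 9 through task 3 -> core 8. Because the offset is reduced modulo the pool size, it rotates assignments within the detected cores; an offset that is a multiple of the pool size leaves the pinning unchanged.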