diff --git a/test/common.py b/test/common.py
index fd03bdedcbaf9..274a169fca6e6 100644
--- a/test/common.py
+++ b/test/common.py
@@ -1844,7 +1844,7 @@ def get_library(self, name, generated_libs, configure=['sh', './configure'], #
     if env_init is None:
       env_init = {}
     if make_args is None:
-      make_args = ['-j', str(shared.get_num_cores())]
+      make_args = ['-j', str(utils.get_num_cores())]
 
     build_dir = self.get_build_dir()
 
diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index 20e113be96767..570c70a0e880b 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -441,4 +441,4 @@ def __init__(self, co):
 def num_cores():
   if NUM_CORES:
     return int(NUM_CORES)
-  return multiprocessing.cpu_count()
+  return utils.get_num_cores()
diff --git a/tools/js_optimizer.py b/tools/js_optimizer.py
index 2c99bf9ff8209..02beac71b45ea 100755
--- a/tools/js_optimizer.py
+++ b/tools/js_optimizer.py
@@ -244,7 +244,7 @@ def check_symbol_mapping(p):
 
   # if we are making source maps, we want our debug numbering to start from the
   # top of the file, so avoid breaking the JS into chunks
-  intended_num_chunks = round(shared.get_num_cores() * NUM_CHUNKS_PER_CORE)
+  intended_num_chunks = round(utils.get_num_cores() * NUM_CHUNKS_PER_CORE)
   chunk_size = min(MAX_CHUNK_SIZE, max(MIN_CHUNK_SIZE, total_size / intended_num_chunks))
   chunks = chunkify(funcs, chunk_size)
 
diff --git a/tools/shared.py b/tools/shared.py
index 400c9c396d362..c5c927e3e5912 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -115,10 +115,6 @@ def run_process(cmd, check=True, input=None, *args, **kw):
   return ret
 
 
-def get_num_cores():
-  return int(os.environ.get('EMCC_CORES', os.cpu_count()))
-
-
 def returncode_to_str(code):
   assert code != 0
   if code < 0:
@@ -169,7 +165,7 @@ def get_finished_process():
       except subprocess.TimeoutExpired:
         pass
 
-  num_parallel_processes = get_num_cores()
+  num_parallel_processes = utils.get_num_cores()
   temp_files = get_temp_files()
   i = 0
   num_completed = 0
diff --git a/tools/system_libs.py b/tools/system_libs.py
index 8be52d381b33e..1d88c31a29dfd 100644
--- a/tools/system_libs.py
+++ b/tools/system_libs.py
@@ -159,7 +159,7 @@ def get_top_level_ninja_file():
 
 
 def run_ninja(build_dir):
-  cmd = ['ninja', '-C', build_dir, f'-j{shared.get_num_cores()}']
+  cmd = ['ninja', '-C', build_dir, f'-j{utils.get_num_cores()}']
   if shared.PRINT_SUBPROCS:
     cmd.append('-v')
   shared.check_call(cmd, env=clean_env())
@@ -538,7 +538,7 @@ def build_objects(self, build_dir):
     # Choose a chunk size that is large enough to avoid too many subprocesses
     # but not too large to avoid task starvation.
     # For now the heuristic is to split inputs by 2x number of cores.
-    chunk_size = max(1, len(objects) // (2 * shared.get_num_cores()))
+    chunk_size = max(1, len(objects) // (2 * utils.get_num_cores()))
     # Convert batches to commands.
     for cmd, srcs in batches.items():
       cmd = list(cmd)
diff --git a/tools/utils.py b/tools/utils.py
index 6bc0bae630584..36ffce1078daa 100644
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -112,6 +112,20 @@ def delete_contents(dirname, exclude=None):
       delete_file(entry)
 
 
+def get_num_cores():
+  # Prefer `os.process_cpu_count` when available (3.13 and above) since
+  # it takes into account thread affinity.
+  # Fall back to `os.sched_getaffinity` where available and finally
+  # `os.cpu_count`, which should work everywhere.
+  if hasattr(os, 'process_cpu_count'):
+    cpu_count = os.process_cpu_count()
+  elif hasattr(os, 'sched_getaffinity'):
+    cpu_count = len(os.sched_getaffinity(0))
+  else:
+    cpu_count = os.cpu_count()
+  return int(os.environ.get('EMCC_CORES', cpu_count))
+
+
 # TODO(sbc): Replace with functools.cache, once we update to python 3.9
 memoize = functools.lru_cache(maxsize=None)
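
For illustration, a minimal usage sketch of the consolidated helper; the import path (from tools import utils) is an assumption, but the fallback order and the EMCC_CORES override mirror the get_num_cores() function added to tools/utils.py above.

import os

from tools import utils  # assumed import path for the helper added above

# With no override, the helper reports the CPUs actually usable by this
# process: os.process_cpu_count() on Python 3.13+, len(os.sched_getaffinity(0))
# where available, and os.cpu_count() otherwise.
print(utils.get_num_cores())

# EMCC_CORES, when set, takes precedence over the detected value.
os.environ['EMCC_CORES'] = '4'
assert utils.get_num_cores() == 4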