In [None]:
"""swar_popcount_.ipynb"""

# Cell 01


def popcount(n: int) -> int:
    """Count the 1-bits of ``n`` by peeling off the lowest bit repeatedly.

    Each pass splits the remaining value into its quotient and remainder
    modulo 2; the remainder is the current lowest bit and is added to the
    running total. The loop only runs while the value is positive, so zero
    and negative inputs give 0.
    """
    ones = 0
    remaining = n
    while remaining > 0:
        # divmod yields (remaining // 2, remaining % 2) in one step.
        remaining, low_bit = divmod(remaining, 2)
        ones += low_bit
    return ones


# Quick check on a sample value (95601 has ten 1-bits); as the last
# expression of the cell, the result is what the cell displays.
popcount(95601)

In [None]:
# Cell 02

import numpy as np


def popcount_swar(x: np.ndarray) -> np.ndarray:
    """
    SWAR ("SIMD Within A Register") parallel bit count.

    Compute the population count (number of 1-bits) of every element of ``x``,
    treating each element as a 32-bit unsigned integer.

    Parameters
    ----------
    x : array_like
        NumPy array or NumPy scalar. Plain Python ints and sequences are also
        accepted; they are converted with ``np.asarray`` first. Integer input
        is cast to ``uint32``, so only the low 32 bits of wider values take
        part and negative values are counted by their two's-complement bits.

    Returns
    -------
    np.ndarray
        The bit counts as ``uint8``, with the same shape as the input
        (a NumPy scalar when the input is a scalar).
    """
    # Plain Python ints / lists have no ``.astype``; promote them to an array.
    # Anything that already provides ``.astype`` (arrays, NumPy scalars) is
    # passed through untouched so its existing behaviour is preserved.
    if not hasattr(x, "astype"):
        x = np.asarray(x)
    # ensure unsigned 32-bit
    x = x.astype(np.uint32, copy=False)

    pairs_mask = np.uint32(0x5555_5555)    # 01 repeated: low bit of each 2-bit field
    nibbles_mask = np.uint32(0x3333_3333)  # 0011 repeated: low half of each 4-bit field
    bytes_mask = np.uint32(0x0F0F_0F0F)    # 00001111 repeated: low half of each byte

    # pairwise bit sums: every 2-bit field now holds the count of its two bits
    x = x - ((x >> 1) & pairs_mask)
    # 2-bit sums into 4-bit fields
    x = (x & nibbles_mask) + ((x >> 2) & nibbles_mask)
    # 4-bit sums into 8-bit fields (each byte holds a count of at most 8)
    x = (x + (x >> 4)) & bytes_mask
    # fold the four byte counts into the lowest byte
    x = x + (x >> 8)
    x = x + (x >> 16)
    # mask to 6 bits (max popcount of 32 bits is 32 = 0b100000); this also
    # discards the partial sums left in the upper bytes by the folding above
    return (x & np.uint32(0x3F)).astype(np.uint8)


# Same sample value as a one-element array. np.array([95601]) uses NumPy's
# default integer dtype rather than uint32, so this also exercises the
# uint32 cast at the top of popcount_swar.
print(popcount_swar(np.array([95601])))


In [None]:
# Cell 03

import time

# Fixed seed so the benchmark input is identical on every run.
np.random.seed(2019)
# One million random integers in [10_000_000, 100_000_000), stored as int32.
x = np.random.randint(10_000_000, 100_000_000, size=1_000_000, dtype=np.int32)

# np.vectorize wraps the pure-Python popcount so it can be called on an array;
# per the NumPy docs it is a convenience wrapper (essentially a loop), not a
# performance tool — which is what makes it the slow baseline here.
f = np.vectorize(popcount)  # type: ignore

# Time only the popcount call itself, not data generation or printing.
start_time = time.perf_counter()
c = f(x)
elapsed_time = time.perf_counter() - start_time

# Inputs followed by their bit counts.
print(x)
print(c)

print(f"Naive popcount run time (sec): {elapsed_time:.3f}")

In [None]:
# Cell 04

# Time the vectorised SWAR implementation on the same input array `x`
# (this cell relies on `x` and `time` from the earlier cells).
start_time = time.perf_counter()
# NOTE: `c` is rebound here, replacing the naive result from the previous cell.
c = popcount_swar(x)
elapsed_time = time.perf_counter() - start_time

# Inputs followed by their bit counts.
print(x)
print(c)

print(f"SWAR popcount run time (sec): {elapsed_time:.3f}")