In [49]:
import math

import numpy as np
import scipy.special

In [50]:
# Small problem size for the exact counts: N ids in total, K lists
# (presumably inverted lists), invlist_length ids per list.
N = 900
K = 30
# Floor division keeps the list length an int (N / K would give 30.0),
# matching the other configuration cells and allowing it to be used as an index.
invlist_length = N // K

## Count combinations -- exact 

In [51]:
# exact ROC

# Per list: invlist_length * log2(N) bits minus log2(invlist_length!);
# multiplied by the K lists and averaged over the N ids.
log2_list_factorial = math.log2(scipy.special.factorial(invlist_length))
total_bits = K * (invlist_length * math.log2(N) - log2_list_factorial)
bits_per_id = total_bits / N
bits_per_id

6.22347894648459

In [52]:
# exact string

# Accumulate log2 C(N - k * invlist_length, invlist_length) for k = 0 .. K-1.
total_bits = 0
for list_idx in range(K):
    remaining = N - list_idx * invlist_length
    total_bits += math.log2(scipy.special.comb(remaining, invlist_length))

bits_per_id = total_bits / N
bits_per_id

4.787709208088086

## Count combinations -- approximate

In [56]:
# Larger problem size for the approximate counts.
N = 1_000_000
K = 1_000
invlist_length = N // K

In [57]:
# with ROC (incorrect, uses n*log(n))
# n*log2(n) stands in for log2(n!) here, with n = invlist_length.
n_log_n = invlist_length * math.log2(invlist_length)
bits_per_id = K * (invlist_length * math.log2(N) - n_log_n) / N
bits_per_id

9.965784284662089

In [58]:
# with ROC (correct, uses log(n!))
# Lookup table of log2-factorials: cached_factorials[i] == log2(i!) for
# 0 <= i <= N + 1. The leading 0.0 is log2(0!); the cumulative sum of
# log2(1), log2(2), ... supplies the remaining entries.
cached_factorials = np.r_[0.0, np.cumsum(np.log2(np.arange(1, N + 2)), dtype=np.float64)]

# Per list: invlist_length * log2(N) bits minus log2(invlist_length!);
# multiplied by the K lists and averaged over the N ids.
bits_per_id = K * (math.log2(N) * invlist_length - cached_factorials[invlist_length]) / N
bits_per_id

11.402170565119398

In [None]:
# with strings 


def log_ncomb(n, m):
    """Approximate log2 of the binomial coefficient C(n, m).

    Evaluates n*log2(n) - (n - m)*log2(n - m) - m*log2(m), the approximation from
    https://math.stackexchange.com/questions/64716/approximating-the-logarithm-of-the-binomial-coefficient

    Args:
        n: total number of items.
        m: number of items chosen (0 <= m <= n).

    Returns:
        The approximate value of log2(C(n, m)); 0 when m == 0 or m == n.
    """
    # C(n, 0) == C(n, n) == 1, so the log is 0; the closed form below would
    # otherwise call log2(0) and raise a math domain error.
    if m == 0 or m == n:
        return 0
    return n * math.log2(n) - (n - m) * math.log2(n - m) - m * math.log2(m)

# Walk the item counts N, N - invlist_length, ... (K values in total) and
# accumulate the approximate log2 of each binomial coefficient.
total_bits = 0
for remaining in range(N, N - K * invlist_length, -invlist_length):
    total_bits += log_ncomb(remaining, invlist_length)

bits_per_id = total_bits / N
bits_per_id

9.96578428466196

## Count with batched factorials

$$\log\binom{K \cdot n}{n} + \log\binom{(K-1) \cdot n}{n} + \dots + \log\binom{2n}{n} + \log\binom{n}{n}$$

In [None]:
import numpy as np
import scipy

N = 10**6
K = 10**3
n = invlist_length = N // K

# Lookup table: cached_factorials[i] == log2(i!), with entry 0 set to 0.0.
log2_prefix_sums = np.cumsum(np.log2(np.arange(N + 2) + 1), dtype=np.float64)
cached_factorials = np.r_[0.0, log2_prefix_sums[:-1]]


def f(x):
    """Return log2(x!) computed directly via scipy.special.factorial."""
    return np.log2(scipy.special.factorial(x))


# Sum over the K lists of log2(top!) - log2((top - n)!), where top runs over
# N, N - n, ..., i.e. the numerator parts of the binomial coefficients.
log2_numerators = 0
for top in range(N, N - K * n, -n):
    log2_numerators += cached_factorials[top] - cached_factorials[top - n]

# Each of the K binomials also contributes -log2(n!); then average over N ids.
bits_per_id = (log2_numerators - K * cached_factorials[n]) / N

bits_per_id

9.959486815763544

In [None]:
# Direct floating-point evaluation of the binomial coefficient; at this size
# the result is inf (see the cell output).
scipy.special.comb(N=2 * invlist_length, k=invlist_length)

inf