# Requirements

In [20]:
from numba import config, njit, threading_layer, prange
import numpy as np
import random

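Compute $\pi$ by generating random points in the unit square $[0, 1) \times [0, 1)$ and counting how many fall inside the quarter of the unit circle that lies in that square. By symmetry this is the same ratio as for a circle inscribed in a square, so the fraction of hits approximates $\pi/4$.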

In [2]:
def compute_pi(nr_tries):
    """Estimate pi by Monte Carlo sampling in pure Python.

    Draws `nr_tries` points whose coordinates both come from
    `random.random()` and counts the points with x**2 + y**2 < 1.0.

    Parameters
    ----------
    nr_tries : int
        Number of random points to draw.

    Returns
    -------
    float
        4.0 times the fraction of points that passed the test.
    """
    # Tuple elements are evaluated left to right: x is drawn before y.
    points = ((random.random(), random.random()) for _ in range(nr_tries))
    inside = sum(1 for px, py in points if px**2 + py**2 < 1.0)
    return 4.0*inside/nr_tries

In [3]:
@njit
def compute_pi_jit(nr_tries):
    """Monte Carlo estimate of pi, compiled with numba's ``njit``.

    Same algorithm as the pure-Python version: draw `nr_tries` points
    with `random.random()` coordinates and count those with
    x**2 + y**2 < 1.0.

    Parameters
    ----------
    nr_tries : int
        Number of random points to draw.

    Returns
    -------
    float
        4.0 times the fraction of points that passed the test.
    """
    inside = 0
    for _ in range(nr_tries):
        px = random.random()
        py = random.random()
        dist_sq = px**2 + py**2
        if dist_sq < 1.0:
            inside += 1
    return 4.0*inside/nr_tries

In [4]:
@njit(['float64(int64)'])
def compute_pi_jit_sign(nr_tries):
    """Monte Carlo estimate of pi, compiled with an explicit signature.

    The decorator is given the signature 'float64(int64)'. The algorithm
    is otherwise the same: count random points with x**2 + y**2 < 1.0.

    Parameters
    ----------
    nr_tries : int64
        Number of random points to draw.

    Returns
    -------
    float64
        4.0 times the fraction of points that passed the test.
    """
    count = 0
    for _ in range(nr_tries):
        u = random.random()
        v = random.random()
        radius_sq = u**2 + v**2
        if radius_sq < 1.0:
            count += 1
    return 4.0*count/nr_tries

In [21]:
@njit(parallel=True)
def compute_pi_parallel(nr_tries):
    """Monte Carlo estimate of pi, compiled with ``parallel=True``.

    The sampling loop runs over `prange`, and the hit counter is
    accumulated with `+=` inside that loop.

    Parameters
    ----------
    nr_tries : int
        Number of random points to draw.

    Returns
    -------
    float
        4.0 times the fraction of points that passed the test.
    """
    inside = 0
    for _ in prange(nr_tries):
        px = random.random()
        py = random.random()
        dist_sq = px**2 + py**2
        if dist_sq < 1.0:
            # Scalar accumulation inside the prange loop.
            inside += 1
    return 4.0*inside/nr_tries

In [5]:
# Baseline timing: pure-Python implementation with 100,000 samples.
%timeit compute_pi(100_000)

13.5 ms ± 204 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
# Same workload through the @njit-compiled version.
%timeit compute_pi_jit(100_000)

634 µs ± 10.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
# Same workload through the version compiled with the explicit
# 'float64(int64)' signature; the argument is passed as np.int64.
%timeit compute_pi_jit_sign(np.int64(100_000))

628 µs ± 5.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [23]:
# Same workload through the parallel=True version, whose loop uses prange.
%timeit compute_pi_parallel(100_000)

94.6 µs ± 1.59 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


Using numba's just-in-time compiler significantly speeds up the computations.

# Quadrature $\pi$

We will use the following method to compute $\pi$: the definite integral
$$
  \pi = 2 \int_{-1}^{1} \sqrt{1 - x^2} dx
$$

In [8]:
@njit
def quad_pi_jit(nr_steps):
    """Approximate pi by midpoint-rule quadrature, compiled with ``njit``.

    Evaluates 2 * integral of sqrt(1 - x**2) over [-1, 1] by splitting
    the interval into `nr_steps` sub-intervals of width
    ``delta = 2.0/nr_steps`` and sampling the integrand at the centre
    of each sub-interval.

    Parameters
    ----------
    nr_steps : int
        Number of sub-intervals (and of sample points); must be >= 1.

    Returns
    -------
    float
        The quadrature approximation of pi.
    """
    delta = 2.0/nr_steps
    # Sample at the sub-interval midpoints so that the spacing of x is
    # exactly delta. linspace(-1.0, 1.0, nr_steps) would space the points
    # 2/(nr_steps - 1) apart, which does not match the width used in the
    # final sum and biases the result by a factor (nr_steps - 1)/nr_steps.
    half = 0.5*delta
    x = np.linspace(-1.0 + half, 1.0 - half, nr_steps)
    f = np.empty_like(x)
    for i in range(x.size):
        f[i] = np.sqrt(1.0 - x[i]**2)
    return 2.0*f.sum()*delta

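We can implement this so that the loop can be parallelized with `prange`: each iteration writes only to its own element of `f`, so no reduction is needed inside the loop. (Note that numba's `prange` does support simple scalar reductions such as `+=`, as `compute_pi_parallel` above shows; the array-based form is used here to keep the loop body free of shared state.)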

In [9]:
# Request the 'omp' (OpenMP) threading layer for numba's parallel functions.
config.THREADING_LAYER = 'omp'

In [24]:
@njit(parallel=True)
def quad_pi_omp(nr_steps):
    """Approximate pi by midpoint-rule quadrature with a ``prange`` loop.

    Evaluates 2 * integral of sqrt(1 - x**2) over [-1, 1] by splitting
    the interval into `nr_steps` sub-intervals of width
    ``delta = 2.0/nr_steps`` and sampling the integrand at the centre
    of each sub-interval. Defined after the threading layer was set to
    'omp' in this notebook.

    Parameters
    ----------
    nr_steps : int
        Number of sub-intervals (and of sample points); must be >= 1.

    Returns
    -------
    float
        The quadrature approximation of pi.
    """
    delta = 2.0/nr_steps
    # Sample at the sub-interval midpoints so that the spacing of x is
    # exactly delta. linspace(-1.0, 1.0, nr_steps) would space the points
    # 2/(nr_steps - 1) apart, which does not match the width used in the
    # final sum and biases the result by a factor (nr_steps - 1)/nr_steps.
    half = 0.5*delta
    x = np.linspace(-1.0 + half, 1.0 - half, nr_steps)
    f = np.empty_like(x)
    # Each iteration writes only to f[i], so iterations are independent.
    for i in prange(x.size):
        f[i] = np.sqrt(1.0 - x[i]**2)
    return 2.0*f.sum()*delta

For comparison, we first time the serial JIT-compiled implementation and then the parallel one.

In [25]:
# Serial JIT-compiled quadrature with 100,000,000 sample points.
%timeit quad_pi_jit(100_000_000)

387 ms ± 32.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
# Parallel (prange) quadrature with the same 100,000,000 sample points.
%timeit quad_pi_omp(100_000_000)

334 ms ± 23.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


The parallelized version is faster, but the parallel efficiency is far from great.

In [34]:
# Ask numba which threading layer is in use.
threading_layer()

'omp'

In [35]:
# Request the 'tbb' threading layer instead.
# NOTE(review): threading_layer() further down still reports 'omp', so this
# assignment does not appear to take effect at this point in the session.
config.THREADING_LAYER = 'tbb'

In [36]:
@njit(parallel=True)
def quad_pi_tbb(nr_steps):
    """Approximate pi by midpoint-rule quadrature with a ``prange`` loop.

    Evaluates 2 * integral of sqrt(1 - x**2) over [-1, 1] by splitting
    the interval into `nr_steps` sub-intervals of width
    ``delta = 2.0/nr_steps`` and sampling the integrand at the centre
    of each sub-interval. Defined after the threading layer was set to
    'tbb' in this notebook.

    Parameters
    ----------
    nr_steps : int
        Number of sub-intervals (and of sample points); must be >= 1.

    Returns
    -------
    float
        The quadrature approximation of pi.
    """
    delta = 2.0/nr_steps
    # Sample at the sub-interval midpoints so that the spacing of x is
    # exactly delta. linspace(-1.0, 1.0, nr_steps) would space the points
    # 2/(nr_steps - 1) apart, which does not match the width used in the
    # final sum and biases the result by a factor (nr_steps - 1)/nr_steps.
    half = 0.5*delta
    x = np.linspace(-1.0 + half, 1.0 - half, nr_steps)
    f = np.empty_like(x)
    # Each iteration writes only to f[i], so iterations are independent.
    for i in prange(x.size):
        f[i] = np.sqrt(1.0 - x[i]**2)
    return 2.0*f.sum()*delta

In [37]:
# Parallel quadrature defined after THREADING_LAYER was set to 'tbb'.
%timeit quad_pi_tbb(100_000_000)

334 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
# Ask numba again which threading layer is in use after requesting 'tbb'.
threading_layer()

'omp'

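It is somewhat disconcerting that the threading layer seems to be OpenMP only. The likely explanation is that `config.THREADING_LAYER` has to be set before any parallel function is compiled: parallel functions had already been compiled and run with the OpenMP layer earlier in this session, so the later assignment to `'tbb'` has no effect. To actually test TBB, restart the kernel and set `config.THREADING_LAYER = 'tbb'` before compiling or running any `parallel=True` function.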