# Requirements

In [5]:
from numba import config, njit, threading_layer
import numpy as np

Compute $\pi$ by generating random points in a square and counting how many there are in the circle inscribed in the square.

In [2]:
def compute_pi(nr_tries):
    hits = 0
    for _ in range(nr_tries):
        x = random.random()
        y = random.random()
        if x**2 + y**2 < 1.0:
            hits += 1
    return 4.0*hits/nr_tries

In [3]:
@njit
def compute_pi_jit(nr_tries):
    hits = 0
    for _ in range(nr_tries):
        x = random.random()
        y = random.random()
        if x**2 + y**2 < 1.0:
            hits += 1
    return 4.0*hits/nr_tries

In [4]:
@njit(['float64(int64)'])
def compute_pi_jit_sign(nr_tries):
    hits = 0
    for _ in range(nr_tries):
        x = random.random()
        y = random.random()
        if x**2 + y**2 < 1.0:
            hits += 1
    return 4.0*hits/nr_tries

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
NameError: name 'random' is not defined

In [None]:
%timeit compute_pi(100_000)

In [None]:
%timeit compute_pi_jit(100_000)

In [None]:
%timeit compute_pi_jit_sign(np.int64(100_000))

Using numba's just-in-time compiler significantly speeds up the computations.

# Quadrature $\pi$

We will use the following method to compute $\pi$: the definite integral
$$
\frac{\pi}{2} = \int_{-1}^{1} \sqrt{1 - x^2} dx
$$

In [6]:
@njit
def quad_pi_jit(nr_steps):
    delta = 2.0/nr_steps
    x = np.linspace(-1.0, 1.0, nr_steps)
    f = np.empty_like(x)
    for i in range(x.size):
        f[i] = np.sqrt(1.0 - x[i]**2)
    return 2.0*f.sum()*delta

We can implement this so that the loop can be parallelized (numba cannot deal with reductions).

In [7]:
config.THREADING_LAYER = 'omp'

In [8]:
@njit(parallel=True)
def quad_pi_omp(nr_steps):
    delta = 2.0/nr_steps
    x = np.linspace(-1.0, 1.0, nr_steps)
    f = np.empty_like(x)
    for i in range(x.size):
        f[i] = np.sqrt(1.0 - x[i]**2)
    return 2.0*f.sum()*delta

The pure numpy implementation for comparison.

In [9]:
%timeit quad_pi_jit(100_000_000)

305 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit quad_pi_omp(100_000_000)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


 
 Parallel Accelerator Optimizing:  Function quad_pi_omp, 
/tmp/ipykernel_6985/3328431449.py (1)  


Parallel loop listing for  Function quad_pi_omp, /tmp/ipykernel_6985/3328431449.py (1) 
--------------------------------------------|loop #ID
@njit(parallel=True)                        | 
def quad_pi_omp(nr_steps):                  | 
    delta = 2.0/nr_steps                    | 
    x = np.linspace(-1.0, 1.0, nr_steps)----| #0
    f = np.empty_like(x)                    | 
    for i in range(x.size):                 | 
        f[i] = np.sqrt(1.0 - x[i]**2)       | 
    return 2.0*f.sum()*delta----------------| #1
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
243 ms ± 60.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


The parallized version is faster, but the parallel efficiency is far from great.

In [11]:
threading_layer()

'omp'

In [16]:
config.THREADING_LAYER = 'tbb'

In [17]:
@njit(parallel=True)
def quad_pi_tbb(nr_steps):
    delta = 2.0/nr_steps
    x = np.linspace(-1.0, 1.0, nr_steps)
    f = np.empty_like(x)
    for i in range(x.size):
        f[i] = np.sqrt(1.0 - x[i]**2)
    return 2.0*f.sum()*delta

In [18]:
%timeit quad_pi_tbb(100_000_000)

 
 Parallel Accelerator Optimizing:  Function quad_pi_tbb, 
/tmp/ipykernel_6985/3620572884.py (1)  


Parallel loop listing for  Function quad_pi_tbb, /tmp/ipykernel_6985/3620572884.py (1) 
--------------------------------------------|loop #ID
@njit(parallel=True)                        | 
def quad_pi_tbb(nr_steps):                  | 
    delta = 2.0/nr_steps                    | 
    x = np.linspace(-1.0, 1.0, nr_steps)----| #4
    f = np.empty_like(x)                    | 
    for i in range(x.size):                 | 
        f[i] = np.sqrt(1.0 - x[i]**2)       | 
    return 2.0*f.sum()*delta----------------| #5
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
219 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
threading_layer()

'omp'