# This notebook compares numerical integration routines on GPU and CPU

## Init

In [1]:
from numpy.polynomial.legendre import leggauss
from numpwd.integrate.angular import ReducedAngularPolynomial, get_x_mesh, get_phi_mesh
from numpwd.integrate.mesh.trns import get_trns_mesh
import numpy as np

from sympy import S, sqrt, expand_trig
from pandas import DataFrame, set_option, Series

from numpwd.integrate.numeric import ExpressionMap
from numpwd.qchannels.spin import get_spin_matrix_element, dict_to_data
from numpwd.integrate.analytic import SPHERICAL_BASE_SUBS, ANGLE_BASE_SUBS, integrate
from numpwd.integrate.angular import ReducedAngularPolynomial

# Show complete symbolic expressions in DataFrame cells instead of truncating them.
# The fully qualified option name is used: abbreviated names are resolved by
# pattern matching and can become ambiguous across pandas versions.
set_option("display.max_colwidth", None)

## Set up the test matrix kernel for integration

The matrix element corresponds to
$$
    \sigma_1 \cdot \vec l_1 \; \sigma_2 \cdot \vec l_2
$$
where
$$
    \vec l_{1/2} = \frac{\vec q}{2} \pm ( \vec p_i - \vec p_o)
$$

In [2]:
# Spin-momentum contractions sigma_a . l_a for nucleon a = 1, 2 in Cartesian components.
sig1_k1 = S("sigma11 * l11 + sigma12  * l12 + sigma13 * l13")
sig2_k2 = S("sigma21 * l21 + sigma22  * l22 + sigma23 * l23")

# l_1 = +(p_i - p_o) + q/2 and l_2 = -(p_i - p_o) + q/2; q only enters the third component.
l1_subs = {"l11": "+p_i1 - p_o1", "l12": "+p_i2 - p_o2", "l13": "+p_i3 - p_o3 + q/2"}
l2_subs = {"l21": "-p_i1 + p_o1", "l22": "-p_i2 + p_o2", "l23": "-p_i3 + p_o3 + q/2"}

kernel = (sig1_k1 * sig2_k2).subs(l1_subs).subs(l2_subs)
kernel

(sigma11*(p_i1 - p_o1) + sigma12*(p_i2 - p_o2) + sigma13*(p_i3 - p_o3 + q/2))*(sigma21*(-p_i1 + p_o1) + sigma22*(-p_i2 + p_o2) + sigma23*(-p_i3 + p_o3 + q/2))

## Decompose the kernel

The routine below computes
$$
    \left\langle s_o m_{s_o} \big \vert O \big \vert s_i m_{s_i} \right \rangle
$$
where $s, m_s$ are the incoming ($i$) and outgoing ($o$) spins of the two-nucleon system.

In [3]:
# Spin matrix elements of the kernel, collected into a table (one row per element).
spin_elements = get_spin_matrix_element(kernel)
pwd = DataFrame(spin_elements)
print(f"Number of elements: {len(pwd)}")
pwd.head()

Number of elements: 14


Unnamed: 0,s_o,ms_o,s_i,ms_i,val
0,0,0,0,0,p_i1**2 - 2*p_i1*p_o1 + p_i2**2 - 2*p_i2*p_o2 + p_i3**2 - 2*p_i3*p_o3 + p_o1**2 + p_o2**2 + p_o3**2 - q**2/4
1,0,0,1,-1,sqrt(2)*q*(-p_i1 + I*p_i2 + p_o1 - I*p_o2)/2
2,0,0,1,1,sqrt(2)*q*(-p_i1 - I*p_i2 + p_o1 + I*p_o2)/2
3,1,-1,0,0,sqrt(2)*q*(-p_i1 - I*p_i2 + p_o1 + I*p_o2)/2
4,1,1,0,0,sqrt(2)*q*(-p_i1 + I*p_i2 + p_o1 - I*p_o2)/2


In [4]:
# Show the full expression of the first row (s_o = ms_o = s_i = ms_i = 0).
pwd.loc[0, "val"]

p_i1**2 - 2*p_i1*p_o1 + p_i2**2 - 2*p_i2*p_o2 + p_i3**2 - 2*p_i3*p_o3 + p_o1**2 + p_o2**2 + p_o3**2 - q**2/4

## Going to the spherical base

We now substitute in the spherical coordinates with $p, \theta = \arccos(x)$ and $\phi$.

In [5]:
# Substitution rules: Cartesian momentum components -> (p, x, phi) for p_i and p_o.
SPHERICAL_BASE_SUBS

{'p_i1': 'p_i * sqrt(1 - x_i**2) * cos(phi_i)',
 'p_i2': 'p_i * sqrt(1 - x_i**2) * sin(phi_i)',
 'p_i3': 'p_i * x_i',
 'p_o1': 'p_o * sqrt(1 - x_o**2) * cos(phi_o)',
 'p_o2': 'p_o * sqrt(1 - x_o**2) * sin(phi_o)',
 'p_o3': 'p_o * x_o'}

And substitute in the CMS coordinates for the $\phi$s

In [6]:
# Substitution rules expressing phi_i and phi_o through Phi and phi.
ANGLE_BASE_SUBS

{'phi_i': 'Phi + phi/2', 'phi_o': 'Phi - phi/2'}

In [7]:
def _to_cms_spherical(row):
    """Rewrite one matrix element in spherical/CMS variables with exponential phases."""
    substituted = row["val"].subs(SPHERICAL_BASE_SUBS).subs(ANGLE_BASE_SUBS)
    return expand_trig(substituted.rewrite(S("exp")).expand())


# Work on a copy so that the Cartesian table `pwd` stays available.
df = pwd.copy()
df["val"] = df.apply(_to_cms_spherical, axis=1)
print("Number of elements:", len(df))
df.head()

Number of elements: 14


Unnamed: 0,s_o,ms_o,s_i,ms_i,val
0,0,0,0,0,p_i**2 - 2*p_i*p_o*x_i*x_o - p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(I*phi) - p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(-I*phi) + p_o**2 - q**2/4
1,0,0,1,-1,-sqrt(2)*p_i*q*sqrt(1 - x_i**2)*exp(-I*Phi)*exp(-I*phi/2)/2 + sqrt(2)*p_o*q*sqrt(1 - x_o**2)*exp(-I*Phi)*exp(I*phi/2)/2
2,0,0,1,1,-sqrt(2)*p_i*q*sqrt(1 - x_i**2)*exp(I*Phi)*exp(I*phi/2)/2 + sqrt(2)*p_o*q*sqrt(1 - x_o**2)*exp(I*Phi)*exp(-I*phi/2)/2
3,1,-1,0,0,-sqrt(2)*p_i*q*sqrt(1 - x_i**2)*exp(I*Phi)*exp(I*phi/2)/2 + sqrt(2)*p_o*q*sqrt(1 - x_o**2)*exp(I*Phi)*exp(-I*phi/2)/2
4,1,1,0,0,-sqrt(2)*p_i*q*sqrt(1 - x_i**2)*exp(-I*Phi)*exp(-I*phi/2)/2 + sqrt(2)*p_o*q*sqrt(1 - x_o**2)*exp(-I*Phi)*exp(I*phi/2)/2


In [8]:
# First row (s_o = ms_o = s_i = ms_i = 0) after the change of variables.
df.loc[0, "val"]

p_i**2 - 2*p_i*p_o*x_i*x_o - p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(I*phi) - p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(-I*phi) + p_o**2 - q**2/4

## Analytically integrate out $\Phi$

As will be needed later, we compute
$$
    O_{s_o m_{s_o} s_i m_{s_i} m_\lambda}(x_o, x_i, \phi)
    = \int\limits_0^{2\pi} d\Phi \, O_{s_o m_{s_o} s_i m_{s_i}}(x_o, x_i, \phi, \Phi) \exp\{-i m_\lambda  \Phi\}
$$

In [9]:
def integrate_out_big_phi(expr, max_mla=2):
    """Integrate ``expr * exp(-I*mla*Phi)`` for every ``mla`` with ``|mla| <= max_mla``.

    The integration itself is delegated to ``integrate`` as imported from
    ``numpwd.integrate.analytic``; presumably it integrates ``Phi`` over
    ``[0, 2*pi]`` (the results shown in this notebook carry the matching
    factor ``2*pi``) -- see that function for the exact convention.

    Arguments:
        expr: Symbolic expression which may depend on ``Phi``.
        max_mla: Largest ``|mla|`` to evaluate. The default of 2 reproduces
            the original hard-coded range ``-2, ..., 2``.

    Returns:
        ``pandas.Series`` indexed by ``mla`` (ascending) holding one
        integrated expression per value.

    Raises:
        ValueError: If ``max_mla`` is negative.
    """
    if max_mla < 0:
        raise ValueError(f"max_mla must be non-negative, got {max_mla}")

    return Series(
        {
            mla: integrate(expr * S(f"exp(-I*{mla}*Phi)"))
            for mla in range(-max_mla, max_mla + 1)
        }
    )

Because we have to do this for all possible $m_\lambda$ values, there will be more elements.
However, since this expression scales as $p^2$, $|m_\lambda|$ is at most 2 (others are zero because of $Y_{lm}$ integrations).
In particular, only a set of certain $s, m_s$ and $m_\lambda$ combinations are non-zero, because this operator is a scalar.

In [10]:
spin_levels = ["s_o", "ms_o", "s_i", "ms_i"]
all_levels = spin_levels + ["mla"]

# One column per m_lambda from the Phi integration, stacked into a long Series.
tf = df.set_index(spin_levels)["val"].apply(integrate_out_big_phi).stack()
tf.index.names = all_levels
print("Number of all elements:", len(tf))

# Keep only the non-vanishing elements and return to a one-column frame named "val".
nonzero = tf[tf != 0]
tf = nonzero.reset_index().rename(columns={0: "val"}).set_index(all_levels).sort_index()
print("Number of non-zero elements:", len(tf))
tf.head()

Number of all elements: 70
Number of non-zero elements: 14


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,val
s_o,ms_o,s_i,ms_i,mla,Unnamed: 5_level_1
0,0,0,0,0,2*pi*p_i**2 - 4*pi*p_i*p_o*x_i*x_o - 2*pi*p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(I*phi) - 2*pi*p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(-I*phi) + 2*pi*p_o**2 - pi*q**2/2
0,0,1,-1,-1,-sqrt(2)*pi*p_i*q*sqrt(1 - x_i**2)*exp(-I*phi/2) + sqrt(2)*pi*p_o*q*sqrt(1 - x_o**2)*exp(I*phi/2)
0,0,1,1,1,-sqrt(2)*pi*p_i*q*sqrt(1 - x_i**2)*exp(I*phi/2) + sqrt(2)*pi*p_o*q*sqrt(1 - x_o**2)*exp(-I*phi/2)
1,-1,0,0,1,-sqrt(2)*pi*p_i*q*sqrt(1 - x_i**2)*exp(I*phi/2) + sqrt(2)*pi*p_o*q*sqrt(1 - x_o**2)*exp(-I*phi/2)
1,-1,1,-1,0,-2*pi*p_i**2*x_i**2 + 4*pi*p_i*p_o*x_i*x_o - 2*pi*p_o**2*x_o**2 + pi*q**2/2


In [11]:
# Element with s_o = ms_o = s_i = ms_i = mla = 0 after integrating out Phi.
tf.loc[(0, 0, 0, 0, 0), "val"]

2*pi*p_i**2 - 4*pi*p_i*p_o*x_i*x_o - 2*pi*p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(I*phi) - 2*pi*p_i*p_o*sqrt(1 - x_i**2)*sqrt(1 - x_o**2)*exp(-I*phi) + 2*pi*p_o**2 - pi*q**2/2

# Numerical integrations

## Allocate angular meshes

In [12]:
# Mesh sizes for the numerical integration benchmark.
NPHI = 20  # number of phi points (argument of get_phi_mesh)
NX = 30  # number of Gauss-Legendre nodes for x (argument of leggauss)
NP1 = 40  # first size argument of get_trns_mesh
NP2 = 20  # second size argument of get_trns_mesh
NQ = 2  # number of q values

In [13]:
# Mesh points and, where returned, the matching integration weights.
phi, wphi = get_phi_mesh(NPHI)
x, wx = leggauss(NX)  # Gauss-Legendre nodes and weights
p, wp = get_trns_mesh(NP1, NP2)  # the tensor shape further below shows NP1 + NP2 = 60 momentum points
q = np.linspace(0, 1, NQ)  # NQ equally spaced values in [0, 1]

## Allocate reduced angular polynomial

In [14]:
# Angular integration helper on the (x, phi) mesh with its weights.
# lmax=4 matches the largest l_o, l_i appearing in the result table further below.
poly = ReducedAngularPolynomial(x, phi, lmax=4, wx=wx, wphi=wphi)

## Pick a symbolic expression...

In [15]:
# Use the second non-zero element (positional index 1) as the benchmark operator.
selected = tf.iloc[1]
expr = selected["val"]
mla = selected.name[-1]  # the last index level is m_lambda
print("m_lambda:", mla)
expr

m_lambda: -1


-sqrt(2)*pi*p_i*q*sqrt(1 - x_i**2)*exp(-I*phi/2) + sqrt(2)*pi*p_o*q*sqrt(1 - x_o**2)*exp(I*phi/2)

## ... and convert it to a tensor

In [16]:
# Wrap the symbolic expression as a callable of the listed arguments (in this order).
op = ExpressionMap(expr, ("p_o", "p_i", "q", "x_o", "x_i", "phi"))
# Evaluate on the full mesh; the result has one tensor axis per argument.
mat = op(p, p, q, x, x, phi)
mat.shape

(60, 60, 2, 30, 30, 20)

## And integrate over all angles

In [17]:
# Angular integration of the tensor for the selected m_lambda.
# NOTE(review): max_chunk_size presumably limits how much is processed at once — confirm in numpwd.
res = poly.integrate(mat, mla, max_chunk_size=10)

In [18]:
# Shape of the first integrated block; the angular axes are gone after integration.
first_block = tuple(res.values())[0]
print("data shape:", first_block.shape)
# One row per (l_o, l_i, lambda, m_lambda) key of the result.
DataFrame(data=res.keys(), columns=["l_o", "l_i", "lambda", "m_lambda"])

data shape: (60, 60, 2)


Unnamed: 0,l_o,l_i,lambda,m_lambda
0,0,1,1,-1
1,0,2,2,-1
2,0,3,3,-1
3,0,4,4,-1
4,1,0,1,-1
...,...,...,...,...
75,4,4,4,-1
76,4,4,5,-1
77,4,4,6,-1
78,4,4,7,-1


# Benchmarks

## CPU

In [19]:
# Build the tensor once (kept for the size report and the later GPU comparison),
# then time repeated evaluations of the same call.
tensor_cpu = op(p, p, q, x, x, phi)
%timeit op(p, p, q, x, x, phi)
# nbytes / 1024**3 is strictly the size in GiB.
print("Size in GB:", tensor_cpu.nbytes / 1024 ** 3)

359 ms ± 4.56 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Size in GB: 1.9311904907226562


## GPU

Import `cupy` instead of `numpy` and you can run on GPUs

In [20]:
# cupy is first used from here on; the cells above do not reference it.
import cupy as cp
# Show the available GPU together with driver and CUDA versions.
!nvidia-smi

Thu Jul  9 10:16:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 207...  Off  | 00000000:2D:00.0  On |                  N/A |
|  0%   47C    P8    29W / 215W |    510MiB /  7979MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

Move arrays to GPU by calling `cp.array`

In [21]:
# Mesh points copied to the device.
phi_gpu, x_gpu, p_gpu, q_gpu = (cp.array(host) for host in (phi, x, p, q))

# Integration weights copied to the device.
wphi_gpu, wx_gpu, wp_gpu = (cp.array(host) for host in (wphi, wx, wp))

The wrapper that maps symbolic expressions to arrays also works for GPU arrays

In [22]:
# Same ExpressionMap call as on the CPU, now fed with cupy arrays.
tensor_gpu = op(p_gpu, p_gpu, q_gpu, x_gpu, x_gpu, phi_gpu)

And most importantly (porting the array back to CPU), they are still the same

In [23]:
# Mean absolute element-wise difference between the GPU tensor (copied to host) and the CPU tensor.
np.abs(cp.asnumpy(tensor_gpu) - tensor_cpu).mean()

4.607829016736289e-16

In [24]:
# Construct the polynomial from the host meshes (with device weights) ...
poly_gpu = ReducedAngularPolynomial(x, phi, lmax=4, wx=wx_gpu, wphi=wphi_gpu)
# ... then swap its mesh and matrix attributes for device arrays.
# NOTE(review): this patches attributes after construction and so relies on
# ReducedAngularPolynomial internals — confirm no other host-side arrays are cached.
poly_gpu.x = x_gpu
poly_gpu.phi = phi_gpu
poly_gpu.matrix = cp.array(poly_gpu.matrix)

In [25]:
# Same angular integration on device arrays; note the smaller max_chunk_size (5 here, 10 on CPU).
res_gpu = poly_gpu.integrate(tensor_gpu, mla, max_chunk_size=5)

In [26]:
# Every integrated channel must agree between GPU and CPU (mean absolute deviation).
for channel in res_gpu:
    deviation = np.abs(cp.asnumpy(res_gpu[channel]) - res[channel])
    assert deviation.mean() < 1.0e-14

print("Integrated arrays are the same as well")

Integrated arrays are the same as well


In [27]:
tensor_gpu = op(p_gpu, p_gpu, q_gpu, x_gpu, x_gpu, phi_gpu)
%timeit op(p_gpu, p_gpu, q_gpu, x_gpu, x_gpu, phi_gpu)
tensor_gpu.nbytes / 1024 ** 3

5.67 ms ± 91.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


1.9311904907226562

Compared to 360 ms ± 654 µs per loop  on CPU

Compared to 35.7 s ± 209 ms per loop on CPU

## Integration only comparison (no channel mapping)