In [1]:
import numpy as np
import pandas as pd
from timeit import timeit

# Enhancing performance

We first look at native code compilation. Here we show 3 common methods for doing this using `numba` JIT compilation, `cython` AOT compilation, and direct wrapping of C++ code using `pybind11`. In general, `numba` is the simplest to use, while you have the most flexibility with `pybind11`. Which approach gives the best performance generally requires some experimentation.

Then we review common methods for concurrent execution of embarrassingly parallel code using `multiprocessing`, `concurrent.futures` and `joblib`. Comparison of performance using processes and threads is made, with a brief explanation of the Global Interpreter Lock (GIL).

More details for each of the libraries used to improve performance are provided in the course notebooks.

## Python

In [2]:
def cdist(xs, ys):
    """Return the pairwise Euclidean distances between row vectors in xs and ys.

    The explicit double loop is kept on purpose: this pure-Python version is
    the reference implementation that the other variants are checked and
    timed against.

    Parameters
    ----------
    xs : ndarray of shape (m, p)
    ys : ndarray of shape (n, p)

    Returns
    -------
    ndarray of shape (m, n)
        Entry [i, j] is the Euclidean distance between xs[i] and ys[j].

    Raises
    ------
    ValueError
        If xs or ys is not 2-dimensional, or if their column counts differ.
    """

    m, p = xs.shape
    n, q = ys.shape

    # Without this check a single-column input would be silently broadcast
    # against the other array's rows and produce meaningless distances.
    if p != q:
        raise ValueError(
            f"xs and ys must have the same number of columns, got {p} and {q}"
        )

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
    return res

### Sanity check

In [3]:
# Tiny inputs (3 and 2 row vectors in 2-D) whose distances are easy to verify by hand.
xs = np.arange(6, dtype=float).reshape(-1, 2)
ys = np.arange(4, dtype=float).reshape(-1, 2)
zs = cdist(xs, ys)

In [4]:
zs

array([[0.        , 2.82842712],
       [2.82842712, 0.        ],
       [5.65685425, 2.82842712]])

In [5]:
%timeit -r 3 -n 10 cdist(xs, ys)

60.7 µs ± 1.39 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


In [6]:
# Problem size for the benchmarks: m and n row vectors of dimension p.
m = 1000
n = 1000
p = 100

# Seed the global NumPy RNG so that X and Y are identical on every
# Restart & Run All; the timings below then always refer to the same data.
np.random.seed(0)
X = np.random.random((m, p))
Y = np.random.random((n, p))

In [7]:
%%time

Z = cdist(X, Y)

CPU times: user 9.46 s, sys: 20 ms, total: 9.48 s
Wall time: 9.48 s


In [8]:
t0 = timeit(lambda : cdist(X, Y), number=1)

## Using `numba`

In [9]:
from numba import jit, njit

In [10]:
@njit
def cdist_numba(xs, ys):
    """Return the pairwise Euclidean distances between row vectors in xs and ys.

    Same algorithm as the pure-Python cdist, compiled with numba's @njit.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n); entry [i, j] is the distance between
    xs[i] and ys[j].

    Raises ValueError if xs and ys do not have the same number of columns.
    """

    m, p = xs.shape
    n, q = ys.shape

    # Guard against a single-column input being silently broadcast against
    # the other array's rows. The message is a constant string because
    # nopython mode only supports constant exception arguments.
    if p != q:
        raise ValueError("xs and ys must have the same number of columns")

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
    return res

Check

In [11]:
assert(np.allclose(cdist(xs, ys), cdist_numba(xs, ys)))

In [12]:
%%time

Z = cdist_numba(X, Y)

CPU times: user 280 ms, sys: 8 ms, total: 288 ms
Wall time: 287 ms


In [13]:
t_numba = timeit(lambda : cdist_numba(X, Y), number=1)

### Unrolling

We can help `numba` by unrolling the code.

In [14]:
@njit
def cdist_numba1(xs, ys):
    """Return the pairwise Euclidean distances between row vectors in xs and ys.

    Variant of cdist_numba in which the sum over the p coordinates is written
    as an explicit scalar loop instead of a vectorised array expression.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n); entry [i, j] is the distance between
    xs[i] and ys[j].

    Raises ValueError if xs and ys do not have the same number of columns.
    """

    m, p = xs.shape
    n, q = ys.shape

    # The inner loop indexes both arrays with the same k, so the column counts
    # must agree; otherwise one of the arrays would be indexed out of range.
    if p != q:
        raise ValueError("xs and ys must have the same number of columns")

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            # Start from a float so the accumulator has a single type throughout.
            s = 0.0
            for k in range(p):
                s += (ys[j,k] - xs[i,k])**2
            res[i, j] = np.sqrt(s)
    return res

Check

In [15]:
assert(np.allclose(cdist(xs, ys), cdist_numba1(xs, ys)))

In [16]:
%%time

Z = cdist_numba1(X, Y)

CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 111 ms


In [17]:
t_numba1 = timeit(lambda : cdist_numba1(X, Y), number=1)

## Using `cython`

In [18]:
%load_ext cython

In [19]:
%%cython -a

import numpy as np

def cdist_cython(xs, ys):
    """Return the pairwise Euclidean distances between row vectors in xs and ys.

    The body is plain, untyped Python that is merely compiled by Cython; it
    contains no C type declarations.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n); entry [i, j] is the distance between
    xs[i] and ys[j].

    Raises ValueError if xs and ys do not have the same number of columns.
    """

    m, p = xs.shape
    n, q = ys.shape

    # Without this check a single-column input would be silently broadcast
    # against the other array's rows and produce meaningless distances.
    if p != q:
        raise ValueError(
            f"xs and ys must have the same number of columns, got {p} and {q}"
        )

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
    return res

Check

In [20]:
assert(np.allclose(cdist(xs, ys), cdist_cython(xs, ys)))

In [21]:
%%time

Z = cdist_cython(X, Y)

CPU times: user 9.88 s, sys: 8 ms, total: 9.88 s
Wall time: 9.88 s


In [22]:
t_cython = timeit(lambda : cdist_cython(X, Y), number=1)

In [23]:
%%cython -a

import cython
import numpy as np
from libc.math cimport sqrt, pow

@cython.boundscheck(False)
@cython.wraparound(False)
def cdist_cython1(double[:, :] xs, double[:, :] ys):
    """Return the pairwise Euclidean distances between row vectors in xs and ys.

    Typed-memoryview version: the arguments are 2-D double buffers and the
    loops run over C integers.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value is a NumPy array of shape (m, n); entry [i, j] is the
    distance between xs[i] and ys[j].

    Raises ValueError if xs and ys do not have the same number of columns.
    """

    cdef int m, n, p
    # Declare every loop index (including k) as a C integer so that all three
    # loops are guaranteed to compile to plain C loops.
    cdef int i, j, k
    cdef double s

    m = xs.shape[0]
    n = ys.shape[0]
    p = xs.shape[1]

    # Bounds checking is switched off for this function, so a column mismatch
    # would read past the end of a row instead of raising an IndexError.
    if ys.shape[1] != p:
        raise ValueError("xs and ys must have the same number of columns")

    cdef double[:, :] res = np.empty((m, n))

    for i in range(m):
        for j in range(n):
            s = 0.0
            for k in range(p):
                s += pow(ys[j,k] - xs[i,k], 2)
            res[i, j] = sqrt(s)

    # Hand back an ndarray view of the buffer (no copy) rather than the raw
    # memoryview, so the result supports the usual NumPy operations.
    return np.asarray(res)

Check

In [24]:
# Validate the typed version defined in the previous cell (cdist_cython1), not
# the untyped cdist_cython that was already checked above. np.asarray accepts
# whatever buffer-like object cdist_cython1 returns.
assert np.allclose(cdist(xs, ys), np.asarray(cdist_cython1(xs, ys)))

In [25]:
%%time

Z = cdist_cython1(X, Y)

CPU times: user 112 ms, sys: 8 ms, total: 120 ms
Wall time: 120 ms


In [26]:
t_cython1 = timeit(lambda : cdist_cython1(X, Y), number=1)

## Using `pybind11`

In [29]:
%%file funcs.cpp
<%
cfg['compiler_args'] = ['-std=c++11']
cfg['include_dirs'] = ['/usr/include/eigen3']
setup_pybind11(cfg)
%>

#include <pybind11/pybind11.h>
#include <pybind11/eigen.h>

#include <cmath>
#include <Eigen/LU>

namespace py = pybind11;

using Eigen::MatrixXd;

// Pairwise Euclidean distances between the rows of xs (m x p) and ys (n x p).
//
// Returns an m x n matrix whose (i, j) entry is the distance between row i of
// xs and row j of ys. Throws py::value_error (a ValueError on the Python side)
// when the two matrices do not have the same number of columns.
//
// The arguments are taken by const reference so that calling the function
// does not copy the matrices again on the C++ side.
MatrixXd cdist(const MatrixXd &xs, const MatrixXd &ys) {
    // Use Eigen's own index type instead of narrowing rows()/cols() to int.
    using Index = MatrixXd::Index;

    const Index m = xs.rows();
    const Index n = ys.rows();
    const Index p = ys.cols();

    // The inner loop reads column k of both matrices, so the widths must match.
    if (xs.cols() != p) {
        throw py::value_error("xs and ys must have the same number of columns");
    }

    MatrixXd res(m, n);

    for (Index i = 0; i < m; i++) {
        for (Index j = 0; j < n; j++) {
            double s = 0.0;
            for (Index k = 0; k < p; k++) {
                const double d = ys(j, k) - xs(i, k);
                s += d * d;
            }
            res(i, j) = std::sqrt(s);
        }
    }

    return res;
}

// Python module definition: exposes the C++ cdist as funcs.cdist.
PYBIND11_MODULE(funcs, m) {
    m.doc() = "auto-compiled c++ extension";
    // Attach a docstring and argument names so that help(funcs.cdist) is
    // informative and the function can also be called with keyword arguments.
    m.def("cdist", &cdist,
          "Pairwise Euclidean distances between the rows of xs (m, p) and "
          "ys (n, p); returns an (m, n) array.",
          py::arg("xs"), py::arg("ys"));
}

Overwriting funcs.cpp


Check. Note that the call to `cppimport.imp` only needs to be made once to build and wrap the C++ module. Once the module is built, it can subsequently be imported and used like any other module.

In [30]:
import cppimport

# Compile funcs.cpp if necessary and import the resulting extension module.
funcs = cppimport.imp("funcs")

# Validate the pybind11 implementation itself against the pure-Python
# reference (previously the assertion re-tested cdist_cython1 and the result
# of funcs.cdist was computed but never checked).
assert np.allclose(cdist(xs, ys), funcs.cdist(xs, ys))

In [31]:
%%time

Z = funcs.cdist(X, Y)

CPU times: user 156 ms, sys: 4 ms, total: 160 ms
Wall time: 157 ms


In [32]:
t_pybind11 = timeit(lambda : funcs.cdist(X, Y), number=1)

### Tabulation

In [33]:
# One row per implementation, in the order they were timed above.
perf = pd.DataFrame({
    'methods': ['python', 'numba', 'numba1', 'cython', 'cython1', 'pybind11'],
    'times': [t0, t_numba, t_numba1, t_cython, t_cython1, t_pybind11],
})

In [34]:
# Speed-up of each method relative to the first row (the pure-Python baseline),
# rounded to one decimal place.
perf['speed-up'] = (perf['times'].iloc[0] / perf['times']).round(1)
perf

Unnamed: 0,methods,times,speed-up
0,python,9.340464,1.0
1,numba,0.277997,33.6
2,numba1,0.111793,83.6
3,cython,9.911234,0.9
4,cython1,0.119332,78.3
5,pybind11,0.154224,60.6


## Using multiple cores

The standard implementation of Python (CPython) uses a Global Interpreter Lock (GIL). This means that only one thread can execute Python bytecode at any one time, and multiple threads work by time-slicing. Hence multi-threaded code that spends most of its time waiting (for example on I/O) can see speed-ups, but multi-threaded code that is computationally intensive will not. For numerically intensive Python code, the work needs to be run in separate processes to see speed-ups.

First we see how to split the computation into pieces using a loop.

In [35]:
xs

array([[0., 1.],
       [2., 3.],
       [4., 5.]])

In [36]:
ys

array([[0., 1.],
       [2., 3.]])

In [37]:
cdist(xs, ys)

array([[0.        , 2.82842712],
       [2.82842712, 0.        ],
       [5.65685425, 2.82842712]])

In [38]:
res = np.concatenate([cdist(x, ys) for x in np.split(xs, 3, 0)])
res

array([[0.        , 2.82842712],
       [2.82842712, 0.        ],
       [5.65685425, 2.82842712]])

In [39]:
%%time

Z = cdist(X, Y)

CPU times: user 9.98 s, sys: 8 ms, total: 9.99 s
Wall time: 9.99 s


### Using `multiprocessing`

In [40]:
from multiprocessing import Pool

In [41]:
%%time

# Name the pool `pool` rather than `p`: `p` is already the global column count
# defined with the benchmark data, and rebinding it to a Pool would break any
# later re-run of the cell that builds X and Y from m, n and p.
with Pool(processes=4) as pool:
    # One task per chunk of rows of X; starmap unpacks each (X_, Y) tuple
    # into the two positional arguments of cdist.
    Z1 = pool.starmap(cdist, [(X_, Y) for X_ in np.split(X, 100, 0)])
    Z1 = np.concatenate(Z1)

CPU times: user 32 ms, sys: 44 ms, total: 76 ms
Wall time: 2.94 s


Check

In [42]:
np.testing.assert_allclose(Z, Z1)

### Using `concurrent.futures`

In [43]:
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

In [44]:
def cdist_(args):
    """Call cdist with its arguments packed into a single sequence.

    args is unpacked positionally into cdist, so it is expected to be
    (xs, ys). This lets cdist be used with map-style APIs that pass one
    item per call.
    """
    distances = cdist(*args)
    return distances

In [45]:
%%time

# Four worker processes; each task is one (row-chunk of X, Y) tuple for cdist_.
with ProcessPoolExecutor(max_workers=4) as pool:
    Z2 = np.concatenate(
        list(pool.map(cdist_, [(X_, Y) for X_ in np.split(X, 100, 0)]))
    )

CPU times: user 136 ms, sys: 88 ms, total: 224 ms
Wall time: 2.7 s


Check

In [46]:
np.testing.assert_allclose(Z, Z2)

### Using `joblib`

`joblib` provides parallel processing using a comprehension syntax.

In [47]:
from joblib import Parallel, delayed

In [48]:
%%time

# One delayed cdist call per row-chunk of X, executed by 4 joblib workers;
# the per-chunk results are stacked back into a single array.
Z3 = np.concatenate(
    Parallel(n_jobs=4)(
        delayed(cdist)(X_, Y) for X_ in np.split(X, 100, 0)
    )
)

CPU times: user 144 ms, sys: 60 ms, total: 204 ms
Wall time: 2.68 s


Check

In [49]:
np.testing.assert_allclose(Z, Z3)

### Using threads

Note that there is no gain with using multiple threads for computationally intensive tasks because of the GIL.

In [50]:
%%time

# Same task list as the process-based version, but run on four threads.
with ThreadPoolExecutor(max_workers=4) as pool:
    Z4 = np.concatenate(
        list(pool.map(cdist_, [(X_, Y) for X_ in np.split(X, 100, 0)]))
    )

CPU times: user 10.3 s, sys: 184 ms, total: 10.4 s
Wall time: 10.2 s


Check

In [51]:
np.testing.assert_allclose(Z, Z4)