In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext Cython
%timeit

import cython
import numpy as np
import matplotlib
matplotlib.style.use('ggplot')
import pandas


In [2]:
cython.__version__

'3.0.2'

# Example 1


If you example speeds too much time defining and array that you might not necesarily need in another algorithm it might be worth to devectorize.

In [3]:
def vectorized_sum_allocating_array(n=1_000_000):
    x = np.arange(n)
    return np.sum(x)

In [4]:
vectorized_sum_allocating_array(n=1_000_000)

499999500000

In [5]:
%timeit vectorized_sum_allocating_array(n=1_000_000)

332 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [6]:
def sum_up_to_n(n=1_000_000):
    s = 0
    for i in range(n):
        s +=i
    return s

In [7]:
%timeit sum_up_to_n(1_000_000)

49.3 ms ± 353 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
%%cython 

cpdef long sum_up_to_n_cython(long n):
    cdef:
        long s = 0
        long i

    for i in range(n):
        s +=i
    return s

In [9]:
%timeit sum_up_to_n_cython(1_000_000)

54.5 ns ± 2.02 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [10]:
%timeit sum_up_to_n_cython(100_000_000)

53.7 ns ± 0.642 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [11]:
sum_up_to_n_cython(100_000_000)

4999999950000000

#### Pure python mode

In [12]:

def sum_up_to_n_cython_pure_python(n: cython.long):
    s: cython.long = 0
    n: cython.long 
    i: cython.long 
                
    for i in range(n):
        s +=i
    return s

In [13]:
sum_up_to_n_cython_pure_python(1_000_000)

499999500000

In [14]:
py_time = %timeit -o sum_up_to_n_cython_pure_python(1_000_000)

48.4 ms ± 231 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%%cython

import cython

def sum_up_to_n_cython_pure_python(n: cython.long):
    s: cython.long = 0
    n: cython.long 
    i: cython.long 
        
    for i in range(n):
        s +=i
    return s

In [16]:
sum_up_to_n_cython_pure_python(1_000_000)

499999500000

In [17]:
cy_time = %timeit -o sum_up_to_n_cython_pure_python(1_000_000)

45.5 ns ± 0.295 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


Another way to do it

In [18]:
%%cython 
import cython

@cython.returns(cython.long)
@cython.locals(s=cython.long, i=cython.long, n=cython.long)
def sum_up_to_n_cython_pure_python(n):
    s = 0
    for i in range(n):
        s +=i
    return s

In [19]:
sum_up_to_n_cython_pure_python(1_000_000)

499999500000

In [20]:
cy_time2 = %timeit -o sum_up_to_n_cython_pure_python(1_000_000)

45.2 ns ± 0.202 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [21]:
py_time.average / cy_time.average 

1063354.7963181522

In [22]:
py_time.average, cy_time.average 

(0.04838811191428566, 4.550514285714295e-08)

In [23]:
py_time.average/ cy_time.average 

1063354.7963181522

## Example 2

Similar to the previous one, this example allocates arrays that are not needed

In [24]:
n_samples = 1_000_000
n_queries = 1
n_features = 10

query_vector = np.random.random((1, n_features))
X = np.random.random((n_samples,n_features))
X.shape, query_vector.shape

((1000000, 10), (1, 10))

In [25]:
def euclidean_naive(x,B):
    return np.sqrt(np.sum((x-B)**2,axis=1))

In [26]:
%timeit euclidean_naive(query_vector, X)

24.9 ms ± 225 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
euclidean_naive(query_vector, X)

array([1.66036199, 1.67123203, 1.20434742, ..., 1.49072671, 1.67138599,
       1.72398514])

In [28]:
xB = np.empty(X.shape)

def euclidean(x,B, xB):
    result = np.empty(B.shape[0])
    np.subtract(x,B,out=xB)
    np.square(xB,out=xB)
    np.sum(xB,axis=1,out=result)
    np.sqrt(result,out=result)
    return result

In [29]:
%timeit euclidean(query_vector, X, xB)

24.8 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
%%cython 
import numpy as np
import cython 
from libc.math cimport pow, sqrt

@cython.boundscheck(False)  # Deactivate bounds checking
def cy_euclidean(double[:,:] q,double[:,:] X):
    cdef int n_samples = X.shape[0]
    cdef int n_features = q.shape[1]
    cdef double res=0
    cdef double[:] result = np.zeros(len(X), dtype="double")

    for m in range(n_samples):
        res = 0.
        for i in range(n_features):
            res += pow(q[0,i]- X[m, i],2)
        result[m] = sqrt(res)
    
    return np.array(result)

In [31]:
cy_time = %timeit -o cy_euclidean(query_vector, X)

4.53 ms ± 9.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [32]:
# checks that the result of the vectorized and non vectorized 
# implementations produces the same results
py_distances = euclidean_naive(query_vector, X)
cy_distances = cy_euclidean(query_vector, X)
np.testing.assert_allclose(py_distances,cy_distances, rtol=1e-6, atol=0.00)

## Example 3

Find in a list of strings the first possiton that matches a given string

In [33]:
%%cython
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef int index_cython(list l, str q):
    cdef:
        int pos = 0, k = 0
        int n_l = len(l)
        str u

    for k in range(n_l):
        u = l[k]
        if u == q:
            return pos
        else:
            pos = pos +1
    raise ValueError

In [34]:
ids = [str(i) for i in range(1_000_000)]
q = '900000'
ids.index(q)

900000

In [35]:
%timeit index_cython(ids,q)

4.9 ms ± 64.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
%timeit ids.index(q)

10.8 ms ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Example 4

This example, based on creating random numbers and doing checks, shows that if most of the time is spend in creating random numbers (with numpy) it is not worth to devectorize your code. Unless you have a fast random sampling method in C that you can call many times fast enough.

In [37]:
def dart():
    x,y = np.random.random(), np.random.random()
    return (x*x + y*y <= 1)

def pi(n):
    dart_counts = sum((dart() for _ in range(n)))
    return dart_counts*4/n

In [38]:
N = 100_000
py_time = %timeit -o pi(N)

85.2 ms ± 556 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [39]:
def np_pi(n):
    x = np.random.random(n)
    y = np.random.random(n)
    dart_counts = (x*x + y*y <= 1).sum()
    
    return dart_counts*4/n

In [40]:
np_time = %timeit -o np_pi(N)

1.1 ms ± 1.87 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [41]:
print(f'vectorized version is {round(py_time.average /np_time.average,1)}x faster')

vectorized version is 77.6x faster


In [42]:
%%cython -a
import numpy as np
import cython 

#@cython.boundscheck(False)  # Deactivate bounds checking
cpdef float cy_pi(int n):
    cdef:
        int counter = 0
        int flag
        float x, y

    for i in range(n):
        x = np.random.random()
        y = np.random.random()
        counter += x*x + y*y <= 1
        
    return counter * 4 / n

In [43]:
# This example is slow and most of the time is spend calling at 
# the np.random.random
cy_time = %timeit -o cy_pi(N)

62.9 ms ± 274 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [44]:
print(f'vectorized version is {round(py_time.average /np_time.average,1)}x faster')

vectorized version is 77.6x faster


In [45]:
%%cython -a
import numpy as np
import cython 
from libc.stdlib cimport rand, RAND_MAX

@cython.cdivision(True)
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate bounds checking
cpdef float cy_pi(n: cython.int ):
    counter: cython.int  
    flag: cython.int 
    x: cython.float
    y: cython.float
    i: cython.int 
        
    for i in range(n):
        x = rand() / RAND_MAX 
        y = rand() / RAND_MAX 
        counter += x*x + y*y <= 1
        
    return counter * 4 / n

In [46]:
cy_time = %timeit -o cy_pi(N)

1.38 ms ± 1.98 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [47]:
%%cython -a
import numpy as np
import cython 
from cython.parallel import prange
from libc.stdlib cimport rand, RAND_MAX

@cython.cdivision(True)
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate bounds checking
cpdef float cy_par_pi(n: cython.int ):
    counter: cython.int  
    flag: cython.int 
    i: cython.int
    counter = 0
    
    cdef double[:] x=np.random.random(n)
    cdef double[:] y=np.random.random(n)

    for i in range(n):
        x_ = x[i]
        y_ = y[i]
        counter += x_*x_ + y_*y_ <= 1
        
    return counter * 4 / n

In [48]:
cy_time = %timeit -o cy_par_pi(N)

1.03 ms ± 1.98 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
