In [1]:
import time
import numpy as np
import numba 
import cython
%load_ext cython

# Cython with numpy

In [177]:
%%cython -a
# cython: infer_types=True

cimport cython
import numpy as np

ctypedef fused my_type:
    int
    double
    long

@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)   # Deactivate negative indexing
def naive_convolve(my_type [:,::1] f, my_type [:,::1] g):
    # f is an image and is indexed by (v, w)
    # g is a filter kernel and is indexed by (s, t),
    #   it needs odd dimensions
    # h is the output image and is indexed by (x, y),
    #   it is not cropped
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")

    # smid and tmid are number of pixels between the center pixel
    # and the edge, ie for a 5x5 filter they will be 2.
    #
    # The output size is calculated by adding smid, tmid to each
    # side of the dimensions of the input image.
    
    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid

    if my_type is int:
        dtype = np.intc
    elif my_type is double:
        dtype = np.double
    else:
        dtype = np.long
        
    h_np = np.zeros([xmax, ymax], dtype=dtype)
    cdef my_type [:,::1] h = h_np
    
    # Do convolution
    cdef my_type value
    for x in range(xmax):
        for y in range(ymax):
            # Calculate pixel value for h at (x,y). Sum one component
            # for each pixel (s, t) of the filter g.
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h_np

In [124]:
N = 600
f = np.arange(N*N, dtype=np.int).reshape((N,N))
g = np.arange(81, dtype=np.int).reshape((9, 9))

In [184]:
# Pure Python version
%timeit naive_convolve(f, g)

18.2 s ± 920 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [129]:
# Cython version with no change in code -- only some slight improvment
%timeit naive_convolve(f, g)

12.9 s ± 154 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [132]:
# Cython version with some types added -- manual version
%timeit naive_convolve(f, g)

9.65 s ± 494 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [158]:
# Cython version with memory views
%timeit naive_convolve(f, g)

118 ms ± 2.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [160]:
# Cython version with memory views and deactivated boundary checking & negative indexing
%timeit naive_convolve(f, g)

24.7 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [162]:
# Cython version with memory views, deactivated boundary checking & negative indexing, and contiguous memory
%timeit naive_convolve(f, g)

21.9 ms ± 519 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [168]:
# Cython version with memory views, deactivated boundary checking & negative indexing, contiguous memory, and inferred and fused types
%timeit naive_convolve(f, g)

21.8 ms ± 381 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Numba magic

In [183]:
import numpy as np
def naive_convolve(f, g):
    # f is an image and is indexed by (v, w)
    # g is a filter kernel and is indexed by (s, t),
    #   it needs odd dimensions
    # h is the output image and is indexed by (x, y),
    #   it is not cropped
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")
    # smid and tmid are number of pixels between the center pixel
    # and the edge, ie for a 5x5 filter they will be 2.
    #
    # The output size is calculated by adding smid, tmid to each
    # side of the dimensions of the input image.
    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid
    # Allocate result image.
    h = np.zeros([xmax, ymax], dtype=f.dtype)
    # Do convolution
    for x in range(xmax):
        for y in range(ymax):
            # Calculate pixel value for h at (x,y). Sum one component
            # for each pixel (s, t) of the filter g.
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h

In [185]:
pairwise_numba = numba.jit(naive_convolve)

In [189]:
# Numba version
%timeit pairwise_numba(f, g)

74.4 ms ± 5.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# My test with Cython and Numba

In [230]:
def getMean(arr):
    length = len(arr)
    rlt = 0
    for x in arr:
        rlt += x / length
    return rlt

In [231]:
%%cython -a
# cython: infer_types=True

cimport cython
import numpy as np

@cython.cdivision(True)
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)   # Deactivate negative indexing
def getMean_cython(double [::1] arr):
    length = arr.shape[0]
    cdef double rlt = 0
    for i in range(length):
        rlt = rlt + arr[i] / length
    return rlt

In [321]:
arr = np.random.rand(5000000)

In [233]:
# Pure python
%timeit getMean(arr)

2.14 s ± 47.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [324]:
# Numpy
%timeit np.mean(arr)

6.64 ms ± 161 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [235]:
getMean_numba = numba.jit(getMean)

In [322]:
# Numba
%timeit getMean_numba(arr)

5.97 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [323]:
# Cython
%timeit getMean_cython(arr)

6.18 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Draft

In [3]:
@numba.jit(nopython=True)
def sum2d(arr):
    M, N = arr.shape
    result = 0.0
    for i in range(M):
        for j in range(N):
            result += arr[i,j]
    return result

In [4]:
arr = np.random.rand(50000, 5000)

In [8]:
startTime = time.time()
sum2d(arr)
print("Elapsed time: {}".format(time.time() - startTime))

Elapsed time: 0.32689476013183594


In [6]:
%timeit sum2d(arr)

324 ms ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [297]:
arr = np.random.rand(10000, 2)
arr = np.ascontiguousarray(arr)

In [268]:
%timeit arr[:, 0] + arr[:, 1]

9.59 µs ± 317 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [303]:
from numba import vectorize, float64

@vectorize([float64(float64, float64)], target='cuda')
def f(x, y):
    return x + y

In [313]:
@numba.njit
def f_jib(arr):
    M, N = arr.shape
    rlt = np.zeros(M)
    for i in range(M):
        rlt[i] = arr[i, 0] + arr[i, 1]
    return rlt

In [304]:
%timeit f(np.ascontiguousarray(arr[:, 0]), np.ascontiguousarray(arr[:, 1]))

The slowest run took 5.82 times longer than the fastest. This could mean that an intermediate result is being cached.
1.91 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [315]:
%timeit f_jib(arr)

10.7 µs ± 51.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [320]:
f_jib.inspect_types()

f_jib (array(float64, 2d, C),)
--------------------------------------------------------------------------------
# File: <ipython-input-313-bf1bf9de8d18>
# --- LINE 1 --- 
# label 0
#   del $0.2
#   del $0.5
#   del $0.3
#   del N
#   del $0.4
#   del $0.6
#   del $0.7
#   del $0.9

@numba.njit

# --- LINE 2 --- 

def f_jib(arr):

    # --- LINE 3 --- 
    #   arr = arg(0, name=arr)  :: array(float64, 2d, C)
    #   $0.2 = getattr(value=arr, attr=shape)  :: (int64 x 2)
    #   $0.5 = exhaust_iter(value=$0.2, count=2)  :: (int64 x 2)
    #   $0.3 = static_getitem(value=$0.5, index=0, index_var=None)  :: int64
    #   $0.4 = static_getitem(value=$0.5, index=1, index_var=None)  :: int64
    #   M = $0.3  :: int64
    #   N = $0.4  :: int64

    M, N = arr.shape

    # --- LINE 4 --- 
    #   $0.6 = global(np: <module 'numpy' from 'C:\\Users\\justin.duan\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\numpy\\__init__.py'>)  :: Module(<module 'numpy' from 'C:\\Users\\justin.duan\\