In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext cython
%timeit

import Cython

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [3]:
Cython.__version__


'0.29.28'

### Find in list: `.index` method

We want to make a function that gets as input an iterable `l` and an element `x` and returns the first position `k` in which there element appars. In other words `l[k]==x`

In [4]:

ids = [str(i) for i in range(1_000_000)]
q = '500000'

In [5]:
%timeit ids.index(q)

6.65 ms ± 259 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
assert ids[ids.index(q)] == q

In [7]:
%%cython -a
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef int index_cython(list l, str q):
    cdef:
        int pos = 0, k = 0
        int n_l = len(l)
        str u

    for k in range(n_l):
        u = l[k]
        if u == q:
            return pos
        else:
            pos = pos +1
    raise ValueError



In [8]:
%timeit index_cython(ids,q)

2.79 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
assert ids[index_cython(ids,q)] == q

#### Restrict the domain: Assume list of uuids

Let´s assume the list is not arbitary but a list where all elements have the same length in memory.

For example we can assume we work with `uuid` objects that have 32 hexadecimal values

In [10]:
import uuid
len(uuid.uuid4().hex)

32

In [42]:
uuids = ['0'*(32-len(hex(i)))+ hex(i) for i in range(1000_000)]
uuids_bytes = [bytes(x,encoding='ascii') for x in uuids]
q = '0'*(32-len(hex(500_000)))+ hex(500_000) 
q = bytes(q, encoding='ascii')

An alternative way

In [44]:
# create data set:
N=10**6
l = [format(i,'016d').encode() for i in range(N)]
q = l[N-1]

In [45]:
q

b'0000000000999999'

In [16]:
%%cython -a
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef int index_cython_bytes(list uuids, bytes q):
    cdef:
        int pos = 0, k = 0
        int n_uuids = len(uuids)
        bytes u

    for k in range(n_uuids):
        u = uuids[k]
        if u == q:
            return pos
        else:
            pos = pos +1

    raise ValueError

In [46]:
index_cython_bytes(l, q)

999999

In [18]:
%%cython -a
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef int index_cython_2(list uuids not None, bytes q):
    cdef:
        int pos = 0, k = 0
        int n_uuids = len(uuids)
        bytes u

    for k in range(n_uuids):
        u = uuids[k]
        if u == q:
            return pos
        else:
            pos = pos +1

    raise ValueError


Error compiling Cython file:
------------------------------------------------------------
...
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef int index_cython_2(list uuids not None, bytes q):
                        ^
------------------------------------------------------------

/Users/davidbuchaca/.ipython/cython/_cython_magic_c74b1c61a42ba5a4a8afdf7390136595.pyx:5:25: 'not None' only allowed in Python functions



In [22]:
%%cython -a
cimport cython
cimport numpy as np
from libc.string cimport memcmp


@cython.boundscheck(False)
@cython.wraparound(False)
def search_cython_c( np.uint8_t[::1] data, np.uint8_t[::1] key):
    cdef int size = len(key)
    cdef int n = len(data)//size
    cdef int i
    for i in range(n):
        if memcmp(<void*>&key[0], <void*>&data[i*size], size) == 0:
            return i
    raise ValueError

In file included from /Users/davidbuchaca/.ipython/cython/_cython_magic_223ecdd469f110ed066259aa09101040.c:711:
In file included from /Users/davidbuchaca/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/arrayobject.h:5:
In file included from /Users/davidbuchaca/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/ndarrayobject.h:12:
In file included from /Users/davidbuchaca/opt/anaconda3/lib/python3.8/site-packages/numpy/core/include/numpy/ndarraytypes.h:1960:
 ^
  0, /*tp_print*/
  ^
/Users/davidbuchaca/opt/anaconda3/include/python3.8/cpython/object.h:260:5: note: 'tp_print' has been explicitly marked deprecated here
    Py_DEPRECATED(3.8) int (*tp_print)(PyObject *, FILE *, int);
    ^
/Users/davidbuchaca/opt/anaconda3/include/python3.8/pyport.h:515:54: note: expanded from macro 'Py_DEPRECATED'
#define Py_DEPRECATED(VERSION_UNUSED) __attribute__((__deprecated__))
                                                     ^
  0, /*tp_print*/
  ^
/Users/davidb

In [26]:
import numpy as np
l_as_np = np.array(l)

search_cython_c(sh_data_as_np.view(np.uint8), 
                np.array([l_as_np[N-1]]).view(np.uint8))  # 4.1 ms ± 148 µs


999999

results

In [30]:
%timeit l.index(q)

15.9 ms ± 955 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
%timeit index_cython_bytes(l, q)

9.84 ms ± 522 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
%timeit search_cython_c(sh_data_as_np.view(np.uint8), np.array([l_as_np[N-1]]).view(np.uint8))  # 4.1 ms ± 148 µs

6.91 ms ± 158 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
