# Advanced NumPy

In [1]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

## ndarray Object Internals

In [2]:
np.ones((10, 5)).shape

(10, 5)

In [3]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

### NumPy dtype Hierarchy

In [4]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)

In [5]:
ints.dtype

dtype('uint16')

In [6]:
np.issubdtype(ints.dtype, np.integer)

True

In [7]:
np.issubdtype(floats.dtype, np.floating)

True

In [13]:
# np.string_ was removed in NumPy 2.0; np.bytes_ is the same fixed-width
# byte-string scalar type and works on older releases as well.
np.dtype('S') == np.bytes_

True

<mark>`np.issubdtype` tests a dtype against any level of the type hierarchy; a type's `mro` method lists all of the parent classes of a specific dtype</mark>

In [14]:
# np.float (a plain alias of the builtin float) was removed in NumPy 1.24.
# np.floating is the abstract parent class of every floating-point dtype,
# which is what this hierarchy check is meant to demonstrate.
np.issubdtype(np.float64, np.floating)

True

In [8]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [9]:
np.issubdtype(ints.dtype, np.number)

True

## Advanced Array Manipulation

### Reshaping Arrays

In [15]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [16]:
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [17]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

> One of the passed shape dimensions can be `-1`, in which case the value used for that dimension will be inferred from the data:

In [18]:
arr = np.arange(15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [19]:
other_arr = np.ones((3, 5))
other_arr.shape

(3, 5)

In [20]:
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [21]:
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [22]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

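<mark>difference: `ravel` returns a view whenever possible (it copies only when the values are not contiguous in the requested order), while `flatten` always returns a copy</mark>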

In [24]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C Versus Fortran Order

In [25]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [26]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [27]:
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and Splitting Arrays

In [28]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [29]:
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [30]:
np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [31]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [32]:
arr = np.random.randn(5, 2)
arr

array([[-0.2047,  0.4789],
       [-0.5194, -0.5557],
       [ 1.9658,  1.3934],
       [ 0.0929,  0.2817],
       [ 0.769 ,  1.2464]])

<mark>`np.split` accepts either a list of indices at which to split the array or an integer number of equal-sized sections</mark>

In [33]:
first, second, third = np.split(arr, [1, 3])

In [34]:
first, second, third

(array([[-0.2047,  0.4789]]),
 array([[-0.5194, -0.5557],
        [ 1.9658,  1.3934]]),
 array([[0.0929, 0.2817],
        [0.769 , 1.2464]]))

#### Stacking helpers: r_ and c_

In [35]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))

In [36]:
arr2 = np.random.randn(3, 2)

In [37]:
np.r_[arr1, arr2] #row

array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [ 1.0072, -1.2962],
       [ 0.275 ,  0.2289],
       [ 1.3529,  0.8864]])

In [32]:
np.c_[np.r_[arr1, arr2], arr] #columns

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [ 1.0072, -1.2962,  3.    ],
       [ 0.275 ,  0.2289,  4.    ],
       [ 1.3529,  0.8864,  5.    ]])

In [33]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: tile and repeat

In [56]:
arr = np.arange(3)
arr

array([0, 1, 2])

In [57]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

> By default, if you pass an integer, each element will be repeated that number of times. If you pass an array of integers, each element can be repeated a different number of times:

In [58]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [59]:
arr = np.random.randn(2, 2)
arr

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

In [60]:
arr.repeat(2, axis=0)

array([[-2.0016, -0.3718],
       [-2.0016, -0.3718],
       [ 1.669 , -0.4386],
       [ 1.669 , -0.4386]])

In [50]:
arr.repeat([2, 3], axis=0)

array([[-2.0016, -0.3718],
       [-2.0016, -0.3718],
       [ 1.669 , -0.4386],
       [ 1.669 , -0.4386],
       [ 1.669 , -0.4386]])

In [51]:
arr.repeat([2, 3], axis=1)

array([[-2.0016, -2.0016, -0.3718, -0.3718, -0.3718],
       [ 1.669 ,  1.669 , -0.4386, -0.4386, -0.4386]])

In [52]:
arr

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

<mark>`repeat` replicates each element (or each row/column along an axis) individually, whereas `tile` stacks copies of the whole array</mark>

In [53]:
np.tile(arr, 2)

array([[-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386]])

In [54]:
arr
np.tile(arr, (2, 1))

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386],
       [-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

In [55]:
np.tile(arr, (3, 2))

array([[-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386],
       [-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386],
       [-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386]])

### Fancy Indexing Equivalents: take and put

In [61]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

In [62]:
arr.take(inds)

array([700, 100, 200, 600])

In [63]:
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [48]:
arr.put(inds, [40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

> `put` does not accept an `axis` argument but rather indexes into the flattened (one-dimensional, C order) version of the array.

In [49]:
inds = [2, 0, 2, 1]
arr = np.random.randn(2, 4)
arr

array([[-0.5397,  0.477 ,  3.2489, -1.0212],
       [-0.5771,  0.1241,  0.3026,  0.5238]])

In [50]:
arr.take(inds, axis=1)

array([[ 3.2489, -0.5397,  3.2489,  0.477 ],
       [ 0.3026, -0.5771,  0.3026,  0.1241]])

## Broadcasting

In [67]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [68]:
arr * 4

array([ 0,  4,  8, 12, 16])

In [69]:
arr = np.random.randn(4, 3)
arr.mean(0)

array([-0.5472, -0.7081, -0.831 ])

In [70]:
demeaned = arr - arr.mean(0)
demeaned

array([[-0.7188,  0.8279, -0.2325],
       [ 0.8801, -1.6513,  0.6315],
       [-0.9948, -0.2627, -0.476 ],
       [ 0.8335,  1.0861,  0.0771]])

In [71]:
demeaned.mean(0)

array([0., 0., 0.])

In [72]:
arr

array([[-1.2659,  0.1198, -1.0635],
       [ 0.3329, -2.3594, -0.1995],
       [-1.542 , -0.9707, -1.307 ],
       [ 0.2863,  0.378 , -0.7539]])

In [73]:
row_means = arr.mean(1)
row_means

array([-0.7365, -0.742 , -1.2733, -0.0299])

In [74]:
row_means.shape

(4,)

In [75]:
row_means.reshape((4, 1))

array([[-0.7365],
       [-0.742 ],
       [-1.2733],
       [-0.0299]])

In [76]:
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([-0., -0.,  0.,  0.])

> **The Broadcasting Rule**: Two arrays are compatible for broadcasting if for each trailing dimension (i.e., starting from the end) the axis lengths match or if either of the lengths is 1. Broadcasting is then performed over the missing or length 1 dimensions.

### Broadcasting Over Other Axes

In [63]:
arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [77]:
arr - arr.mean(1).reshape((4, 1))

array([[-0.5294,  0.8564, -0.327 ],
       [ 1.0749, -1.6174,  0.5425],
       [-0.2687,  0.3025, -0.0338],
       [ 0.3162,  0.4078, -0.724 ]])

In [78]:
arr = np.zeros((4, 4))
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [79]:
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape
arr_3d

array([[[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]]])

In [80]:
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]

array([[0.3313],
       [1.3497],
       [0.0699]])

In [81]:
arr_1d[np.newaxis, :]

array([[0.3313, 1.3497, 0.0699]])

In [82]:
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means

array([[ 0.3295, -0.1133,  0.1215, -1.1062],
       [ 0.5973, -0.3991,  0.5856,  0.1644],
       [ 0.9766, -0.2917, -0.4086,  0.508 ]])

In [83]:
depth_means.shape

(3, 4)

In [84]:
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[ 0.,  0.,  0.,  0.],
       [ 0., -0.,  0., -0.],
       [-0.,  0.,  0., -0.]])

```python
def demean_axis(arr, axis=0):
    """Subtract the mean taken along ``axis`` from ``arr`` via broadcasting.

    Parameters
    ----------
    arr : numpy.ndarray
        Input array of any dimensionality.
    axis : int, default 0
        Axis along which the mean is computed and removed. Negative values
        count from the last axis.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr`` whose mean along ``axis`` is
        (numerically) zero.
    """
    means = arr.mean(axis)

    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # A multidimensional index must be a tuple: current NumPy interprets a
    # list of slices/None as a fancy (array) index and raises instead of
    # inserting the new axis.
    return arr - means[tuple(indexer)]
```

### Setting Array Values by Broadcasting

In [85]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [86]:
col = np.array([1.28, -0.42, 0.44, 1.6])
col[:, np.newaxis]

array([[ 1.28],
       [-0.42],
       [ 0.44],
       [ 1.6 ]])

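<mark>assigning to `arr[:]` writes into the existing array in place, broadcasting the right-hand side across it; it does not create a new array or rebind the name `arr`</mark>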

In [87]:
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [88]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## Advanced ufunc Usage

### ufunc Instance Methods

In [89]:
arr = np.arange(10)
np.add.reduce(arr)

45

In [90]:
arr.sum()

45

In [91]:
np.random.seed(12346)  # for reproducibility
arr = np.random.randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr

array([[-0.9815, -0.09  ,  0.3658,  0.7483,  0.7594],
       [-0.3154, -0.8661,  0.0279, -0.4556, -1.6019],
       [-0.8487, -0.5465, -0.3215,  0.0005,  0.2483],
       [ 0.2539,  1.9368, -0.7995, -0.5692,  0.0489],
       [-0.9535, -0.6491, -0.4795,  0.1754,  1.4225]])

In [92]:
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]])

In [93]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

In [94]:
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

In [95]:
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [96]:
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape

(3, 4, 5)

In [97]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17])

In [98]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

### Writing New ufuncs in Python

In [99]:
def add_elements(x, y):
    """Return ``x + y`` for a single pair of operands.

    Plain Python callable; it is passed to ``np.frompyfunc`` and
    ``np.vectorize`` in this section to build element-wise wrappers.
    """
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [100]:
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [101]:
arr = np.random.randn(10000)
%timeit add_them(arr, arr)

1.32 ms ± 16.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [102]:
%timeit np.add(arr, arr)

2.85 µs ± 53.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Structured and Record Arrays

In [103]:
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5   ,  6), (3.1416, -2)], dtype=[('x', '<f8'), ('y', '<i4')])

In [104]:
sarr[0]

(1.5, 6)

In [105]:
sarr[0]['y']

6

In [106]:
sarr['x']

array([1.5   , 3.1416])

In [107]:
sarr.dtype

dtype([('x', '<f8'), ('y', '<i4')])

In [108]:
sarr.dtype.names

('x', 'y')

### Nested dtypes and Multidimensional Fields

In [109]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [110]:
arr[0]['x']

array([0, 0, 0])

In [111]:
arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [112]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

In [106]:
data['y']

array([5, 6], dtype=int32)

In [107]:
data['x']['a']

array([1., 3.])

### Why Use Structured Arrays?

## More About Sorting

In [113]:
arr = np.random.randn(6)
arr.sort()

In [114]:
arr

array([-1.082 ,  0.3759,  0.8014,  1.1397,  1.2888,  1.8413])

In [115]:
arr = np.random.randn(3, 5)
arr

array([[-0.3318, -1.4711,  0.8705, -0.0847, -1.1329],
       [-1.0111, -0.3436,  2.1714,  0.1234, -0.0189],
       [ 0.1773,  0.7424,  0.8548,  1.038 , -0.329 ]])

In [116]:
arr[:, 0].sort()  # Sort first column values in-place
arr

array([[-1.0111, -1.4711,  0.8705, -0.0847, -1.1329],
       [-0.3318, -0.3436,  2.1714,  0.1234, -0.0189],
       [ 0.1773,  0.7424,  0.8548,  1.038 , -0.329 ]])

In [117]:
arr = np.random.randn(5)
arr

array([-1.1181, -0.2415, -2.0051,  0.7379, -1.0614])

In [118]:
np.sort(arr)

array([-2.0051, -1.1181, -1.0614, -0.2415,  0.7379])

In [119]:
arr

array([-1.1181, -0.2415, -2.0051,  0.7379, -1.0614])

In [120]:
arr = np.random.randn(3, 5)
arr

array([[ 0.5955, -0.2682,  1.3389, -0.1872,  0.9111],
       [-0.3215,  1.0054, -0.5168,  1.1925, -0.1989],
       [ 0.3969, -1.7638,  0.6071, -0.2222, -0.2171]])

In [121]:
arr.sort(axis=1)
arr

array([[-0.2682, -0.1872,  0.5955,  0.9111,  1.3389],
       [-0.5168, -0.3215, -0.1989,  1.0054,  1.1925],
       [-1.7638, -0.2222, -0.2171,  0.3969,  0.6071]])

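<mark>there is no descending-sort option; to get a reverse sort, reverse the sorted result with a slice: `arr[::-1, :]` reverses the order of the rows and `arr[:, ::-1]` reverses the order within each row</mark>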

In [122]:
arr[:, ::-1]

array([[ 1.3389,  0.9111,  0.5955, -0.1872, -0.2682],
       [ 1.1925,  1.0054, -0.1989, -0.3215, -0.5168],
       [ 0.6071,  0.3969, -0.2171, -0.2222, -1.7638]])

### Indirect Sorts: argsort and lexsort

In [123]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer

array([1, 2, 4, 3, 0])

In [124]:
values[indexer]

array([0, 1, 2, 3, 5])

In [125]:
arr = np.random.randn(3, 5)
arr[0] = values
arr

array([[ 5.    ,  0.    ,  1.    ,  3.    ,  2.    ],
       [-0.3636, -0.1378,  2.1777, -0.4728,  0.8356],
       [-0.2089,  0.2316,  0.728 , -1.3918,  1.9956]])

In [126]:
arr[:, arr[0].argsort()]

array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [-0.1378,  2.1777,  0.8356, -0.4728, -0.3636],
       [ 0.2316,  0.728 ,  1.9956, -1.3918, -0.2089]])

In [127]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
sorter

array([1, 2, 3, 0, 4])

<mark>`lexsort` orders the data starting with the last array passed, so here `last_name` is the primary key and `first_name` breaks ties</mark>

In [128]:
list(zip(last_name[sorter], first_name[sorter]))

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

### Alternative Sort Algorithms

In [129]:
values = np.array(['2:first', '2:second', '1:first', '1:second',
                   '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer

array([2, 3, 4, 0, 1])

In [130]:
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

### Partially Sorting Arrays

In [131]:
np.random.seed(12345)
arr = np.random.randn(20)
arr

array([-0.2047,  0.4789, -0.5194, -0.5557,  1.9658,  1.3934,  0.0929,
        0.2817,  0.769 ,  1.2464,  1.0072, -1.2962,  0.275 ,  0.2289,
        1.3529,  0.8864, -2.0016, -0.3718,  1.669 , -0.4386])

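<mark>`np.partition(arr, k)` puts the k-th smallest element (zero-based) at index `k`, with all smaller elements before it and all larger elements after it; neither side is otherwise sorted</mark>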

In [132]:
np.partition(arr, 3)

array([-2.0016, -1.2962, -0.5557, -0.5194, -0.3718, -0.4386, -0.2047,
        0.2817,  0.769 ,  0.4789,  1.0072,  0.0929,  0.275 ,  0.2289,
        1.3529,  0.8864,  1.3934,  1.9658,  1.669 ,  1.2464])

In [141]:
indices = np.argpartition(arr, 3)
indices

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9])

In [142]:
arr.take(indices)

array([-2.0016, -1.2962, -0.5557, -0.5194, -0.3718, -0.4386, -0.2047,
        0.2817,  0.769 ,  0.4789,  1.0072,  0.0929,  0.275 ,  0.2289,
        1.3529,  0.8864,  1.3934,  1.9658,  1.669 ,  1.2464])

### numpy.searchsorted: Finding Elements in a Sorted Array

In [136]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)

3

In [137]:
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5])

In [138]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])

array([0, 3])

In [139]:
arr.searchsorted([0, 1], side='right')

array([3, 7])

In [140]:
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])

In [141]:
labels = bins.searchsorted(data)
labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3])

In [142]:
pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

## Writing Fast NumPy Functions with Numba

In [150]:
import numpy as np

def mean_distance(x, y):
    """Return the average of the element-wise differences ``x[i] - y[i]``.

    Iterates over ``len(x)`` positions with an explicit loop (the slow,
    pure-Python baseline that is timed below). Raises ``ZeroDivisionError``
    when ``x`` is empty.
    """
    n_items = len(x)
    total = 0.0
    for idx in range(n_items):
        total += x[idx] - y[idx]
    # One term is accumulated per index, so the divisor is simply len(x).
    return total / n_items

```python
In [209]: x = np.random.randn(10000000)

In [210]: y = np.random.randn(10000000)

In [211]: %timeit mean_distance(x, y)
1 loop, best of 3: 2 s per loop

In [212]: %timeit (x - y).mean()
100 loops, best of 3: 14.7 ms per loop
```

```python
In [213]: import numba as nb

In [214]: numba_mean_distance = nb.jit(mean_distance)
```

```python
@nb.jit
def mean_distance(x, y):
    """Return the average of the element-wise differences ``x[i] - y[i]``.

    Same explicit loop as the pure-Python version, here wrapped with
    ``nb.jit`` as a decorator.
    """
    n_items = len(x)
    total = 0.0
    for idx in range(n_items):
        total += x[idx] - y[idx]
    # One term is accumulated per index, so the divisor is simply len(x).
    return total / n_items
```

```python
In [215]: %timeit numba_mean_distance(x, y)
100 loops, best of 3: 10.3 ms per loop
```

```python
from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def mean_distance(x, y):
    """Return the mean of the element-wise difference ``x - y``.

    The decorator declares an explicit signature: two ``float64[:]``
    arguments and a ``float64`` result.
    """
    difference = x - y
    return difference.mean()
```

### Creating Custom numpy.ufunc Objects with Numba

```python
from numba import vectorize

@vectorize
def nb_add(x, y):
    """Return ``x + y`` for one pair of inputs.

    Decorated with ``vectorize`` (imported from numba above); the examples
    that follow call the result on arrays and use its ``accumulate`` method.
    """
    return x + y
```

```python
In [13]: x = np.arange(10)

In [14]: nb_add(x, x)
Out[14]: array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.])

In [15]: nb_add.accumulate(x, 0)
Out[15]: array([  0.,   1.,   3.,   6.,  10.,  15.,  21.,  28.,  36.,  45.])
```

## Advanced Array Input and Output

### Memory-Mapped Files

In [151]:
mmap = np.memmap('mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [152]:
section = mmap[:5]

In [153]:
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap

In [154]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[ 0.7584, -0.6605,  0.8626, ...,  0.6046, -0.6212,  2.0542],
        [-1.2113, -1.0375,  0.7093, ..., -1.4117, -0.1719, -0.8957],
        [-0.1419, -0.3375,  0.4329, ...,  1.2914, -0.752 , -0.44  ],
        ...,
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [155]:
%xdel mmap
!rm mymmap

### HDF5 and Other Array Storage Options

## Performance Tips

### The Importance of Contiguous Memory

In [None]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous

In [None]:
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)

In [None]:
arr_f.copy('C').flags

In [None]:
arr_c[:50].flags.contiguous
arr_c[:, :50].flags

In [None]:
%xdel arr_c
%xdel arr_f

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS