In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np

In [2]:
# main type ndarray

arr = np.array([1, 2])
type(arr)
isinstance(arr, np.ndarray)

numpy.ndarray

True

floating data representation : matissa + exponent

e.g. ```6.573*10^5``` - mantissa is 6.573 and exponent is 5

* single precision: 32 bits overall, 23 bits for mantissa, 8 bits for exponent
* double precision: 64 bits overall, 52 bins for mantissa, 11 bits for exponent

double precision should be use in most numerical computations (types like np.float64)

* ndarray - collection of items of the same type
* indexed by integers
* homogeneous - all block are interpreted in exactly the same way

* each object interpretation - specified by *data-type object*
* array elements can be built-in and custom structures and classes

In [3]:
arr = np.array([[1, 2], [2, 5]], dtype=np.int32)  # can construct using a dtype
arr.dtype
arr.shape

# usual indexing and slicing works
arr[1, 1]
arr[(1, 1)]  # can index with a tuple

# ndarray is mutable
arr[0,1] = -18
arr

dtype('int32')

(2, 2)

5

5

array([[  1, -18],
       [  2,   5]], dtype=int32)

In [4]:
np.ndarray((3, 5), dtype=np.float64)  # constructor, filled with some garbage

array([[4.68175515e-310, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000]])

Numpy keeps data in a single sequential array in memory. Multidimensional structure can be based on row-major (C-style) or column major (F-style) order

Array stored in a single sequence also called contiguous array (C-contiguous, F-contiguous)

In [5]:
np.ndarray((2, 2), buffer=np.array([1, 2, 3, 4]), dtype=int)  # when passing a buffer need to specify dtype ! to decode correctly

np.ndarray((2, 2), buffer=np.array([1, 2, 3, 4]), dtype=int, order='F')  # Fstyle order (column based)

array([[1, 2],
       [3, 4]])

array([[1, 3],
       [2, 4]])

### other constructors

In [6]:
np.ones((4, 5), dtype=bool)
np.ones((4, 5), dtype=np.float64)

np.zeros((4, 3), dtype=int)

np.full(shape=(2,3), fill_value='hello')

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

array([['hello', 'hello', 'hello'],
       ['hello', 'hello', 'hello']], dtype='<U5')

## Methods

a lot of methods exists as array object methods and as np functions (like np.any(arr) or arr.any())

In [7]:
arr = np.array([1, 3, 4, 6, 2, 5, 3, 4, 7, 6, 4])

In [8]:
arr.any()  # if any element is True/non-zero
np.zeros(4).any()

True

False

In [9]:
# argmax, argmin

arr.argsort()  # returns indices that would sort the array

array([ 0,  4,  1,  6,  2,  7, 10,  5,  3,  9,  8])

In [10]:
# casting

arr.astype(float)

arr.astype(str)

array([1., 3., 4., 6., 2., 5., 3., 4., 7., 6., 4.])

array(['1', '3', '4', '6', '2', '5', '3', '4', '7', '6', '4'],
      dtype='<U21')

### Endianness

The smallest data group with an address is eight bits long and is called a byte. Larger groups comprise two or more bytes, for example a 32-bit word contains four bytes. There are two possible ways a computer could number the individual bytes in a larger group, starting at either end, and over the course of computer history, both methods have been used, leading to occasional problems.

Internally, any given computer will work equally well regardless of what endianness it uses, since its hardware will consistently use the same endianness to both store and load its data. For this reason, programmers and computer users normally ignore the endianness of the computer they are working with. 

However, endianness can become an issue when moving data external to the computer – as when transmitting data between different computers, or a programmer investigating internal computer bytes of data from a memory dump – and the endianness used differs from expectation. In these cases, the endianness of the data must be understood and accounted for.

In [11]:
arr.byteswap()  # toggle between big-endian and little-endian
# on any given computer one part will be a mess, another one - true numbers; but for external data may need to swap byte order to get real values
arr.byteswap().byteswap()

array([ 72057594037927936, 216172782113783808, 288230376151711744,
       432345564227567616, 144115188075855872, 360287970189639680,
       216172782113783808, 288230376151711744, 504403158265495552,
       432345564227567616, 288230376151711744])

array([1, 3, 4, 6, 2, 5, 3, 4, 7, 6, 4])

In [12]:
np.choose([1, 0, 1], [[2, 3, 3], [8, 4, -2]])  # choose 1st, 2nd, 3rd etc elements from different arrays for which the numbers are defined in the first index arg

# clip

array([ 8,  3, -2])

In [13]:
np.arange(6).reshape(3, 2, order='F')
arr = np.arange(6).reshape(3, 2)  # order is 'C' by default
arr

arr.flatten()
arr.flatten(order='F')


arr.compress([True, False, True], axis=0)  # apply bool array mask along given axis

array([[0, 3],
       [1, 4],
       [2, 5]])

array([[0, 1],
       [2, 3],
       [4, 5]])

array([0, 1, 2, 3, 4, 5])

array([0, 2, 4, 1, 3, 5])

array([[0, 1],
       [4, 5]])

In [14]:
# copy, cumsum, cumprod
arr.cumsum()  # will flatten the array if no dim specified
np.cumsum(arr, axis=0)

arr.f

array([ 0,  1,  3,  6, 10, 15])

array([[0, 1],
       [2, 4],
       [6, 9]])

AttributeError: 'numpy.ndarray' object has no attribute 'f'

In [15]:
arr = np.arange(12).reshape(4, 3)
arr

arr.diagonal()  # one of the matrix diagonals
arr.diagonal(offset=1)
arr.diagonal(offset=-1)

arr.trace(offset=-1)  # same rules but calculates the diagonal value sum (trace)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

array([0, 4, 8])

array([1, 5])

array([ 3,  7, 11])

21

In [16]:
np.arange(4).dot(np.array([1, 1, -1, 1]))  # array dot-product

arr.T.dot(np.array([1, 1, -1, 1]))  # matrix-vector dot-product

arr = np.arange(4)
arr.shape  # one-dim array - cannot transpose
arr.reshape(1, 4)  # need to create 2d array, then can transpose and use in matrix operations
# 1d array can be used in matrix products if the required shape is (len(arr), 1)

2

array([ 6,  8, 10])

(4,)

array([[0, 1, 2, 3]])

In [17]:
# np.dump(file), np.load(file) - pickle for np arrays

# min, max, mean, nonzero, std, sum, squeeze, var

In [18]:
# ravel vs flatten : both functions for flattening the array

arr = np.arange(6).reshape(3, 2)
arr

arr_flat = arr.flatten()  # returns a copy
arr_flat[0] = -18
arr

arr_flat = arr.ravel()  # return a view of the original array where possible
arr_flat[0] = -18
arr

# representations of the original arrays are called views (i.e. if we change smth in the view - we'll see the change in the original array)

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[-18,   1],
       [  2,   3],
       [  4,   5]])

In [19]:
np.repeat(np.arange(3), 5)

# np.repeat(np.arange(3), 5, axis=1)  # not possible
np.repeat(np.arange(3).reshape(3, 1), 5, axis=1)  # ok after creating a 2d array

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

array([[0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [2, 2, 2, 2, 2]])

In [20]:
np.round([2.43, 1.45], 1)

array([2.4, 1.4])

In [21]:
arr
arr.flags  # some useful info; can also set flags using .setflags(..)

array([[-18,   1],
       [  2,   3],
       [  4,   5]])

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [22]:
# sort

a = np.array([[1,4],[3,1]])
np.sort(a)                # sort along the last axis

np.sort(a, axis=None)     # sort the flattened array

np.sort(a, axis=0)        # sort along the first axis

array([[1, 4],
       [1, 3]])

array([1, 1, 3, 4])

array([[1, 1],
       [3, 4]])

In [23]:
arr = np.arange(6).reshape(3, 2)
arr.take([(1, 2), (0, 2), (1,0)])  # select values for a groups of indices

np.arange(1, 5)[[1, 2]]  # simpler approach will work for 1d array 

array([[1, 2],
       [0, 2],
       [1, 0]])

array([2, 3])

In [24]:
arr.tobytes()  # convert to binary string

b'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00'

In [25]:
arr.tolist()

[[0, 1], [2, 3], [4, 5]]

In [26]:
arr
arr.swapaxes(1, 0)  # same as transpose

arr3d = np.arange(27).reshape(3, 3, 3)
arr3d
arr3d.swapaxes(0, 2)  # swap any pair of axes

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[0, 2, 4],
       [1, 3, 5]])

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

array([[[ 0,  9, 18],
        [ 3, 12, 21],
        [ 6, 15, 24]],

       [[ 1, 10, 19],
        [ 4, 13, 22],
        [ 7, 16, 25]],

       [[ 2, 11, 20],
        [ 5, 14, 23],
        [ 8, 17, 26]]])

In [27]:
arr = np.array([[1, 2, 3], [5, 2, 6]], dtype=np.int64)
arr

# generating new data views of the same array (the same bytes in memory interpreted for a different dtype, not type casting!)
arr.view(dtype=bool)
arr.view(dtype=float)
arr.view(dtype=np.int32)

mx = arr.view(type=np.matrix)  # can also pass numpy data type (e.g. view arr as matrix)
mx
mx[0, 0] = -20  # it's a view, so points to the same data, and can change it
arr

array([[1, 2, 3],
       [5, 2, 6]])

array([[ True, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False, False],
       [ True, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False, False]])

array([[4.9e-324, 9.9e-324, 1.5e-323],
       [2.5e-323, 9.9e-324, 3.0e-323]])

array([[1, 0, 2, 0, 3, 0],
       [5, 0, 2, 0, 6, 0]], dtype=int32)

matrix([[1, 2, 3],
        [5, 2, 6]])

array([[-20,   2,   3],
       [  5,   2,   6]])

In [28]:
# attributes
arr

arr.nbytes
arr.shape
arr.ndim
arr.size
arr.data

array([[-20,   2,   3],
       [  5,   2,   6]])

48

(2, 3)

2

6

<memory at 0x7f8100134ba0>

In [29]:
arr1 = np.array([2, 1, 4])
arr2 = np.array([3, 0, 3])

arr1 <= arr2  # same shape comparison
arr1 * arr2  # component-wise arythmetic operations

array([ True, False, False])

array([ 6,  0, 12])

In [30]:
# scalar arrays
scalar_arr = np.array(5)
scalar_arr

scalar_arr.shape
scalar_arr.sum()

array(5)

()

5

### array scalars

In [31]:
isinstance(4, np.int64)
isinstance(np.int64(4), np.int64)
isinstance(np.array(4), np.int64)

False

True

False

In [32]:
isinstance(4, np.object)

True

In [33]:
# numpy has a complicated hierarchy of types
# it supports native python types : np.int, np.float, np.object, np.bool etc
# but has many more type with extra info : like np.int32, np.int64, np.uint8 etc

x = np.int64(3)
type(x)
isinstance(x, np.int)
isinstance(x, np.integer)  # do not mix int (python built-in) and integer

x = 3
type(x)
# from python output it's not clear what kind of e.g. int we have

numpy.int64

False

True

int

In [34]:
# numpy supports scalars, and the scalar type hierarchy mimics (almost) the dtype hieararchy

x = np.array([1, 2, 3], dtype=np.int64)
type(x[1])
type(np.array(x[1]))  # scalar array is different though from numpy scalar

numpy.int64

numpy.ndarray

In [35]:
# numpy scalars have a lot of various method and attributes (support all of ndarray methods, though a lot are not implemented in numpy.generic - base class for all scalar types)

# all numpy scalars are inherited from numpy.generic that has all the same methods as ndarray
# but a lot of them don't make sense for scalars and are not implemented

np.int64(4).flags
np.int64(4).ndim
np.int64(4).sum()

  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : False
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

0

4

In [36]:
x = np.int32(5)
isinstance(x, np.generic)

True

#### definining new type

* can subclass ndarray and define a method of interest : but this will only work to some extent; a lot of the things that work in ndarray are based on the check that this is ndarray, so some functionality will be lost
* the best way to define it in C using numpy C-API
* often inheritance is probably not needed, potentially a function or a new class which has ndarray as a data field (composition) will work well

### data types

* np.dtype - data type objects
* describe the type, size, byteorder (little/big endian), data element own features if it's a structured object
* a numpy scalar type is associated with each data type, any element extracted from ndarray via indexing will have this data type

**np.dtype objects are not the same as scalar type objects, but can used instead of one another when creating arrays**

In [37]:
np.dtype('<i4') == np.int32  # dtype and the corresponding scalar type

dt = np.dtype('>i4')  # > specifies big-endian byteorder
dt.byteorder
dt.name

isinstance('abs', np.unicode)  # unicode scalar type
np.array('abs').dtype  # unicode dtype '<U3' - 3 chars

True

'>'

'int32'

True

dtype('<U3')

### structured data type

In [38]:
# dtype can be specified by dict-like list of tuples
dt = np.dtype([('name', np.unicode_, 16), ('grades', np.float64, (2,))])
dt
dt['name']
dt['grades']

dtype([('name', '<U16'), ('grades', '<f8', (2,))])

dtype('<U16')

dtype(('<f8', (2,)))

In [39]:
# can use structured data type for arrays with elements of mixed data
x = np.array([('Sarah', (8.0, 7.0)), ('John', (6.0, 7.0))], dtype=dt)
x

x['name']  # names used in the dtype structure can be used to access numpy array subarrays (just like in pandas)
x['grades']
x[1]
x[0]['name']

array([('Sarah', [8., 7.]), ('John', [6., 7.])],
      dtype=[('name', '<U16'), ('grades', '<f8', (2,))])

array(['Sarah', 'John'], dtype='<U16')

array([[8., 7.],
       [6., 7.]])

('John', [6., 7.])

'Sarah'

In [40]:
# np.dtype is a dtype constructor, can accept python types, list for of tuples (name, scalar type), type strings etc
np.dtype(int)
np.dtype(np.int64)
np.dtype('f8')  # type string

# dtype also has functions to change byteorder

dtype('int64')

dtype('int64')

dtype('float64')

In [41]:
dt = np.dtype(int)

# dtype object attributes
dt.str
dt.name
dt.num  # unique id number (for each dtype)
dt.type
dt.itemsize

'<i8'

'int64'

7

numpy.int64

8

In [42]:
# for structured dtypes
dt = np.dtype([('name', '<U24'), ('score', np.int64)])
dt

dt.fields  # returns a dict
dt.names

dtype([('name', '<U24'), ('score', '<i8')])

mappingproxy({'name': (dtype('<U24'), 0), 'score': (dtype('int64'), 96)})

('name', 'score')

In [43]:
dt.kind

# various dtype kinds (may be useful)
print("""
b boolean
i signed integer
u unsigned integer
f floating-point
c complex floating-point
m timedelta
M datetime
O object
S (byte-)string
U Unicode
V void
""")

'V'


b boolean
i signed integer
u unsigned integer
f floating-point
c complex floating-point
m timedelta
M datetime
O object
S (byte-)string
U Unicode
V void



In [44]:
x = np.array([1, 2, 3])
x.dtype.kind == 'i'

True

**numbers** module - hierarchy of abstract numerical classes (not of which can be instantiated)

In [45]:
from numbers import Number, Complex, Real, Rational, Integral

isinstance(np.int64(3), Integral)
isinstance(np.float64(3), Integral)
isinstance(np.float64(3), Real)
isinstance(np.float64(3), Rational)

True

False

True

False

In [46]:
# can use issubclass to check if numbers types encompass numpy types
issubclass(np.int64, Integral)
issubclass(np.float16, Real)

True

True

In [47]:
dt = np.dtype(('i4', (2, 3)))
dt.shape  # shape and subdtype for structured dtypes
dt.subdtype

(2, 3)

(dtype('int32'), (2, 3))

### indexing

In python in general ```x[i1, i2, i3]``` is equivalent to ```x[(i1, i2, i3)]```

In [48]:
# basic indexing

arr = np.arange(12)
arr[2:9:2]  # slicing with step

array([2, 4, 6, 8])

In [49]:
arr_new = arr[1:5, np.newaxis]  # np.newaxis slice is used to expand the dimension
arr_new
arr_new.shape, arr.shape

array([[1],
       [2],
       [3],
       [4]])

((4, 1), (12,))

In [50]:
arr = np.array([[1, 2, 3], [5, 3, 6]])
arr[1, :]  # ndim is reduced by one
arr[1:2, :]  # same ndim

array([5, 3, 6])

array([[5, 3, 6]])

In [51]:
# arr[(1, 2)] - basic indexing
# arr[(1, 2),] - will triger advanced indexing, arr[[1, 2]] is also advanced indexing

# integer and boolean advanced indexing

# integer indexing 
x = np.array([[1, 2], [3, 4], [5, 6]])
x[[0, 1, 2], [0, 1, 0]]  # select using row and columns indices
x[(0, 1)]  # to basic
x[[0, 1]]  # to advanced (starts from first dimensions)

array([1, 4, 5])

2

array([[1, 2],
       [3, 4]])

In [52]:
# boolean indexing
arr = np.arange(8)
arr[arr < 6]

# behaviour is similar to passing bool_obj.nonzero() integer index

array([0, 1, 2, 3, 4, 5])

In [53]:
(arr<6).nonzero()

(array([0, 1, 2, 3, 4, 5]),)

In [54]:
# structured array indexing

arr = np.array([[1, 4, 2], [2, 5, 4], [1, 2, 6]], dtype=np.dtype([('a', '<i8'), ('b', '<i8'), ('c', '<i8')]))
arr

array([[(1, 1, 1), (4, 4, 4), (2, 2, 2)],
       [(2, 2, 2), (5, 5, 5), (4, 4, 4)],
       [(1, 1, 1), (2, 2, 2), (6, 6, 6)]],
      dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')])

In [55]:
# to create structured array : need to pass array of tuples, and specify each tuple entry dtype in the dtype constructor (so structured array fields won't correspond to e.g. indiv. columns as in pandas)
dt = np.dtype([('a', 'i8'), ('b', 'i8'), ('c', 'i8')])
arr = np.array([(1, 4, 2), (2, 5, 4), (1, 2, 6)], dtype=dt)
arr
arr.shape
arr['b']  # we can access indiv. field dict-like (select single or several tuple entries from each tuple item)
arr[['a', 'c']]
arr['b'][1]

arr.astype(np.object)  # remove specification of the entry (leave general object)

array([(1, 4, 2), (2, 5, 4), (1, 2, 6)],
      dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')])

(3,)

array([4, 5, 2])

array([(1, 2), (2, 4), (1, 6)],
      dtype={'names':['a','c'], 'formats':['<i8','<i8'], 'offsets':[0,16], 'itemsize':24})

5

array([(1, 4, 2), (2, 5, 4), (1, 2, 6)], dtype=object)

In [56]:
arr = np.arange(9).reshape(3, 3)
arr

flat_it = arr.flat  # returns a flattened iterator
flat_it
list(flat_it)

np.nditer(arr)  # provides iterator, but over elements as scalar arrays

# can use nditer and ndararys with various specifications inside Cython code to increase performance (along with using numba or C-extension)
# check numpy C-API (!) for writing C-extensions

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

<numpy.flatiter at 0x562efb033170>

[0, 1, 2, 3, 4, 5, 6, 7, 8]

<numpy.nditer at 0x7f8100038170>

In [57]:
# default internal loop goes through elements one by one
# passing external_loop flag with allow numpy to use vectorized code and process larger chunks of data
np.nditer(arr, flags=['external_loop'])

# it's still likely better to use numba, Cython or C-extension for time-critical loops

<numpy.nditer at 0x7f81000383f0>

### It is not recommended to subclass ndarrays (!) Instead numpy dispatch mechanism should be used

### It is recommended not to use np.matrix subclass, it's used mostly for interacting with scipy.sparse, and eventually will be removed in the future  (even for linear algebra regular arrays are recommended)

### Chararray is not recommended to new development (exists for compatibility)

In [58]:
#  memory map
# used to map binary file to np.ndarray and access parts of the data without loading the whole file
# (useful for big files)

data = np.arange(12, dtype='float32')
data.resize((3,4))

from tempfile import mkdtemp  # useful for making temp dirs
from pathlib import Path
filename = Path(mkdtemp())/'newfile.dat'

# create memory map
fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))

# assign data to it
fp[:] = data[:]

del fp  # deletion flushed memory to disk (could also use .flush())

In [59]:
# can now read the data
newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))  # can specify filemode (r, rw, r+ etc)
newfp[1, 3]

7.0

### record arrays

array that allows field access using attributes

In [60]:
np.recarray  # ndarray subclass
np.record  # corresponding scalar class

numpy.recarray

numpy.record

In [61]:
# standard array creation with structured dtype
arr = np.array([(1.0, 2), (3.0, 4)], dtype=[('x', '<f8'), ('y', '<i8')])
arr

arr['x']
#arr.x  # won't work

arr = arr.view(np.recarray)  # creating record array
arr.x

array([(1., 2), (3., 4)], dtype=[('x', '<f8'), ('y', '<i8')])

array([1., 3.])

array([1., 3.])

In [62]:
# recarray using constructor (names and formats arguments, and an array of tuples)
rec_arr = np.recarray(names=['a', 'b'], formats=['i8', 'f8'], 
        buf=np.array([(5, 4), (2, 6), (1, 2)]), shape=(3,))
rec_arr
rec_arr['a']


# recarray form 2D array (!) - interprets rows as corresponding to the recarray fields
arr = np.arange(6).reshape(2, 3)
rec_arr = np.core.records.fromarrays(arr, names=['a', 'b'], formats=['i8', 'f8'])
type(rec_arr)
rec_arr['b']
rec_arr['a']
rec_arr.dtype

# recarray has similar method for ndarray

rec.array([(5, 2.e-323), (2, 3.e-323), (1, 1.e-323)],
          dtype=[('a', '<i8'), ('b', '<f8')])

array([5, 2, 1])

numpy.recarray

array([3., 4., 5.])

array([0, 1, 2])

dtype((numpy.record, [('a', '<i8'), ('b', '<f8')]))

### masked arrays

```np.ma``` module - the same functionality as ndarray plus working with masks and missing data

In [63]:
arr = np.array([1, 2, 3, -1, 4])
mx = np.ma.masked_array(arr, mask=[0,0,0,1,0])  # 1 or True at the invalid values location
mx

mx.mean()  # mean without the invalid value

type(mx)  # MaskedArray - subclass of ndarray

masked_array(data=[1, 2, 3, --, 4],
             mask=[False, False, False,  True, False],
       fill_value=999999)

2.5

numpy.ma.core.MaskedArray

In [64]:
mx = np.ma.masked_array(arr, mask=[0,0,0,1,0], fill_value=12)  # get add a fill_value
mx

masked_array(data=[1, 2, 3, --, 4],
             mask=[False, False, False,  True, False],
       fill_value=12)

In [65]:
mx
mx.astype(float)  # standard ndarray methods work

masked_array(data=[1, 2, 3, --, 4],
             mask=[False, False, False,  True, False],
       fill_value=12)

masked_array(data=[1.0, 2.0, 3.0, --, 4.0],
             mask=[False, False, False,  True, False],
       fill_value=12.0)

In [66]:
arr = np.arange(10)
arr_masked = np.ma.masked_where(arr > 5, arr)  # mask using a condition
arr_masked

arr_masked.mask  # accessing the mask
arr_masked.compressed()  # get only valid values

masked_array(data=[0, 1, 2, 3, 4, 5, --, --, --, --],
             mask=[False, False, False, False, False, False,  True,  True,
                    True,  True],
       fill_value=999999)

array([False, False, False, False, False, False,  True,  True,  True,
        True])

array([0, 1, 2, 3, 4, 5])

In [67]:
arr_masked
arr_masked[0] = np.ma.masked  # masking another value(s); assign any value to unmask instead
arr_masked

masked_array(data=[0, 1, 2, 3, 4, 5, --, --, --, --],
             mask=[False, False, False, False, False, False,  True,  True,
                    True,  True],
       fill_value=999999)

masked_array(data=[--, 1, 2, 3, 4, 5, --, --, --, --],
             mask=[ True, False, False, False, False, False,  True,  True,
                    True,  True],
       fill_value=999999)

In [68]:
# can perform some math operations on masked arrays
x = np.array([2, -1, 3, 4])
np.log(x)  # produced nan
np.ma.log(x)  # produces masked array

array([0.69314718,        nan, 1.09861229, 1.38629436])

masked_array(data=[0.6931471805599453, --, 1.0986122886681098,
                   1.3862943611198906],
             mask=[False,  True, False, False],
       fill_value=1e+20)

In [69]:
arr = np.array([1, 2, 3, -1, 2, -4])
arr_masked = np.ma.masked_values(arr, -1)  # masking certain value
arr_masked

arr_masked.filled(10)  # filling missing value

masked_array(data=[1, 2, 3, --, 2, -4],
             mask=[False, False, False,  True, False, False],
       fill_value=-1)

array([ 1,  2,  3, 10,  2, -4])

In [70]:
arr_masked
arr_masked[3] == np.ma.masked

masked_array(data=[1, 2, 3, --, 2, -4],
             mask=[False, False, False,  True, False, False],
       fill_value=-1)

masked

In [71]:
is_equal = arr_masked == arr_masked.copy()  # comparison keeps the mask structure
is_equal

is_equal.all()  # correct values

# compare with 
(np.array([1, 3, np.nan]) == np.array([1, 3, np.nan])).all()

masked_array(data=[True, True, True, --, True, True],
             mask=[False, False, False,  True, False, False],
       fill_value=True)

True

False

In [72]:
arr_nans = np.array([1, 2, 4, np.nan, 2])
arr_masked = np.ma.masked_invalid(arr_nans)  # array with nans to masked array
arr_masked

masked_array(data=[1.0, 2.0, 4.0, --, 2.0],
             mask=[False, False, False,  True, False],
       fill_value=1e+20)

## datetimes and timedeltas

numpy datetime data type is called 'datetime64' (to distinguish from python datetime)

In [83]:
np.datetime64('2005-02-25')  # creation from ISO string
np.datetime64('2005-02')  # using only month

np.datetime64('2005-02-25T03:30')  # datetime ISO

np.datetime64('nat')  # not a time, for missing values (like NaN - not a number)

numpy.datetime64('2005-02-25')

numpy.datetime64('2005-02')

numpy.datetime64('2005-02-25T03:30')

numpy.datetime64('NaT')

In [90]:
d = np.datetime64('2005-02')  
d.dtype  # getting dtype (month here)

d = np.datetime64('2005-02-04')
d.dtype  

np.datetime64('2002', 'M')  # forcing unit 
np.datetime64('2002', 'D') 

dtype('<M8[M]')

dtype('<M8[D]')

numpy.datetime64('2002-01')

numpy.datetime64('2002-01-01')

In [94]:
arr = np.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64')  # array with datetimes
arr
arr.dtype  # contains unit ['D'] - determined automatically, or can specify, like 'datetime64[D]'

np.array(['2001-01-01T12:00', '2002-02-03T13:56:03.172'], dtype='datetime64')

array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]')

dtype('<M8[D]')

array(['2001-01-01T12:00:00.000', '2002-02-03T13:56:03.172'],
      dtype='datetime64[ms]')

In [99]:
# ndarray with datetime work with many common numpy functions
np.arange('2005-02', '2005-03', dtype='datetime64[D]')

np.arange('2005-02', '2005-03', dtype='datetime64[W]')  # weekly frequency

array(['2005-02-01', '2005-02-02', '2005-02-03', '2005-02-04',
       '2005-02-05', '2005-02-06', '2005-02-07', '2005-02-08',
       '2005-02-09', '2005-02-10', '2005-02-11', '2005-02-12',
       '2005-02-13', '2005-02-14', '2005-02-15', '2005-02-16',
       '2005-02-17', '2005-02-18', '2005-02-19', '2005-02-20',
       '2005-02-21', '2005-02-22', '2005-02-23', '2005-02-24',
       '2005-02-25', '2005-02-26', '2005-02-27', '2005-02-28'],
      dtype='datetime64[D]')

array(['2005-01-27', '2005-02-03', '2005-02-10', '2005-02-17'],
      dtype='datetime64[W]')

In [100]:
# auto cast to bigger unit
np.datetime64('2005') == np.datetime64('2005-01-01')

True

numpy does not store timezone information (!)

In [101]:
# numpy timedelta
np.timedelta64(1, 'D')

numpy.timedelta64(1,'D')

In [102]:
np.datetime64('2009-01-01') - np.datetime64('2008-01-01')

numpy.timedelta64(366,'D')

In [103]:
# cast to smaller unit -> set to the first instant
np.datetime64('2009') + np.timedelta64(20, 'D')

numpy.datetime64('2009-01-21')

In [104]:
# timedelta arythmetic works with different units
np.timedelta64(1,'W') % np.timedelta64(10,'D')

numpy.timedelta64(7,'D')

In [108]:
# business day functionality
np.busday_offset('2011-06-23', 2)

np.busday_offset('2011-06-25', 0, roll='forward')  # today or next
np.busday_offset('2011-06-25', 0, roll='backward')  # today or prev; etc

np.is_busday(np.datetime64('2011-07-15'))

np.busday_count(np.datetime64('2011-07-11'), np.datetime64('2011-07-18'))  # distance in bdays

numpy.datetime64('2011-06-27')

numpy.datetime64('2011-06-27')

numpy.datetime64('2011-06-24')

True

5

**Units** 

for dates: Y, M, W, D

for time: h, m, s, ms, us, ns, ps, fs, as

In [112]:
# datetime as string
d = np.arange('2002-10-27T04:30', 4*60, 60, dtype='M8[m]')
d

np.datetime_as_string(d, timezone='UTC')

import pytz
np.datetime_as_string(d, timezone=pytz.timezone('US/Eastern'))  # passing timezone

np.datetime_as_string(d, unit='h')  # passing bigger unit

array(['2002-10-27T04:30', '2002-10-27T05:30', '2002-10-27T06:30',
       '2002-10-27T07:30'], dtype='datetime64[m]')

array(['2002-10-27T04:30Z', '2002-10-27T05:30Z', '2002-10-27T06:30Z',
       '2002-10-27T07:30Z'], dtype='<U35')

array(['2002-10-27T00:30-0400', '2002-10-27T01:30-0400',
       '2002-10-27T01:30-0500', '2002-10-27T02:30-0500'], dtype='<U39')

array(['2002-10-27T04', '2002-10-27T05', '2002-10-27T06', '2002-10-27T07'],
      dtype='<U32')

In [120]:
import datetime as dt

#  d.astype('datetime64[D]') + dt.timedelta(days=4)  # won't work with python datetime
d.astype('datetime64[D]') + np.timedelta64(dt.timedelta(days=4))  # can convert datetime objects to np.datetime64 etc

array(['2002-10-31T00:00:00.000000', '2002-10-31T00:00:00.000000',
       '2002-10-31T00:00:00.000000', '2002-10-31T00:00:00.000000'],
      dtype='datetime64[us]')

## Universal functions (ufunc)

can be applied to ndarrays in element-wise fashion, instances of numpy.ufunc class

**broadcasting** 
* ufunc vectorizes functions that are normallly apply to scallars
* in some cases ufunc will work for ndarrays with inconsistent shapes:
    - if the sizes of the dimensions either coincide or some of the sizes are 1 (in this case the unique value is propagated along the dimension)
    - if the number of dimensions is smaller but 1's can be added to some dimensions to satisfy the previous property

In [124]:
# a lot of math and trigonometric ufunc are available 
np.sin([1, 2, 3])
np.add(np.array([1, 2, 3]), np.array([3, 2, 4]).reshape(1, 3))  # broadcasting

array([0.84147098, 0.90929743, 0.14112001])

array([[4, 4, 7]])

In [126]:
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

## Routines

array creation 

In [136]:
# empty, ones, zeros, full
# empty_like, ones_like, zeros_like, full_like
# eye, identity

np.full((3, 4), fill_value=-10)  # shape is passed as a tuple
np.ones(10, dtype=bool)  # can pass dtype

np.eye(3, 5)  # if second arg is passed - it's the number of cols that may be different
np.identity(2)  # always a square Id matrix

np.empty((3, 2))  # garbage values, better to use e.g. np.zeros

array([[-10, -10, -10, -10],
       [-10, -10, -10, -10],
       [-10, -10, -10, -10]])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]])

array([[1., 0.],
       [0., 1.]])

array([[4.9e-324, 9.9e-324],
       [1.5e-323,      nan],
       [9.9e-324,      nan]])

creating array from existing objects

In [147]:
np.array([1, 2, 3])  # accepts array-like, for which the definition is hard to define in terms of the source code, so a better definition is simply objects for which np.array will work

np.array([1, 2, 3], dtype=float)  # passing dtype

np.array(np.mat('1 2; 3 4'))  # will convert ndarray subclasses to ndarray
np.array(np.mat('1 2; 3 4'), subok=True)  # let subclasses to pass through

arr = np.array([4, 3, 6])
arr_view = np.array(arr, copy=False)  # copy=False to generate a view rather than a copy, default copy=True
arr_view[0] = -1
arr
# copying will be applied if the new dtype is passed or a subclass object is passed and subok=False

array([1, 2, 3])

array([1., 2., 3.])

array([[1, 2],
       [3, 4]])

matrix([[1, 2],
        [3, 4]])

array([-1,  3,  6])

In [150]:
# methods similar to np.array but with different rules for copying
arr = np.array([2, 1, 3])

np.asarray(arr)  # no copy by default if the arg dtype and order match (compatible ndarray)
np.asanyarray(arr)  # no copy if compatible ndarray or subclass (same as copy=False, subok=True)
np.ascontiguousarray(arr)  # no copy if compatible ndarray and order='C' (copy=False, order='C')
np.asfortranarray(arr)  # same for order='F'
np.copy(arr)  # copy with the same order, has subok argument

# use copy.deepcopy to copy ndarray elements with all the depending memory (e.g. if it's an array of dicts - copy dicts, not their references)

array([2, 1, 3])

array([2, 1, 3])

array([2, 1, 3])

array([2, 1, 3])

array([2, 1, 3])

In [153]:
# create array from buffer object
s = b'hello world'  # binary string
np.frombuffer(s, dtype='S1', count=5, offset=6)  # can specify offset (skip) and count (how many bytes to read)

np.frombuffer(b'\x01\x02\x03\x04\x05', dtype=np.uint8, count=3)

# np.fromfile : efficient way to read data from binary file

array([b'w', b'o', b'r', b'l', b'd'], dtype='|S1')

array([1, 2, 3], dtype=uint8)

In [155]:
iterable = (x*x for x in range(5))
np.array(iterable)  # won't iterate through 
np.fromiter(iterable, dtype=float)  # create array from iterable 

array(<generator object <genexpr> at 0x7f80e028f510>, dtype=object)

array([ 0.,  1.,  4.,  9., 16.])

In [165]:
np.fromstring('1 2', dtype=int, sep=' ')

np.fromstring('12.42, 3.21', dtype=float, sep=', ')

array([1, 2])

array([12.42,  3.21])

### creating record arrays

np.rec is an alias for np.core.records and should be used

In [167]:
np.rec.fromrecords([(456,'dbe',1.2),(2,'de',1.3)],)

rec.array([(456, 'dbe', 1.2), (  2, 'de', 1.3)],
          dtype=[('f0', '<i8'), ('f1', '<U3'), ('f2', '<f8')])

In [172]:
np.rec.array([(1, b'a', 1.1), (2, b'dd', 2. ), (3, b'xyz', 3. ),(4, b'12', 4. )],
                    dtype=[('a', '<i4'), ('b', 'S3'), ('c', '<f4')])


rec_arr = np.rec.fromrecords([(456,'dbe',1.2),(2,'de',1.3)], names='col1,col2,col3')
rec_arr
rec_arr.dtype

a = b'\x01\x02\x03abc'
np.rec.fromstring(a, dtype='u1,u1,u1,S3')

rec.array([(1, b'a', 1.1), (2, b'dd', 2. ), (3, b'xyz', 3. ),
           (4, b'12', 4. )],
          dtype=[('a', '<i4'), ('b', 'S3'), ('c', '<f4')])

rec.array([(456, 'dbe', 1.2), (  2, 'de', 1.3)],
          dtype=[('col1', '<i8'), ('col2', '<U3'), ('col3', '<f8')])

dtype((numpy.record, [('col1', '<i8'), ('col2', '<U3'), ('col3', '<f8')]))

rec.array([(1, 2, 3, b'abc')],
          dtype=[('f0', 'u1'), ('f1', 'u1'), ('f2', 'u1'), ('f3', 'S3')])

### numerical ranges

In [176]:
np.arange(2, 20, 3)
np.linspace(10, 20, 15)  # start, stop and num of points

# also logspace, geomspace

array([ 2,  5,  8, 11, 14, 17])

array([10.        , 10.71428571, 11.42857143, 12.14285714, 12.85714286,
       13.57142857, 14.28571429, 15.        , 15.71428571, 16.42857143,
       17.14285714, 17.85714286, 18.57142857, 19.28571429, 20.        ])

In [182]:
# meshgrid
nx, ny = (3, 2)
x = np.linspace(0, 1, nx)
y = np.linspace(0, 1, ny)
xv, yv = np.meshgrid(x, y)
x, y
xv, yv

(array([0. , 0.5, 1. ]), array([0., 1.]))

(array([[0. , 0.5, 1. ],
        [0. , 0.5, 1. ]]),
 array([[0., 0., 0.],
        [1., 1., 1.]]))

In [186]:
x = np.zeros((3, 4, 5))
x
x.shape
np.moveaxis(x, 0, -1).shape  # move axis (in the shape dim vector)
np.moveaxis(x, -1, 0).shape

array([[[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])

(3, 4, 5)

(4, 5, 3)

(5, 3, 4)

In [190]:
arr = np.array([[1, 2, 3], [4, 2, 5]])
arr
np.roll(arr, shift=1, axis=1)  # roll value along a given axis 

array([[1, 2, 3],
       [4, 2, 5]])

array([[3, 1, 2],
       [5, 4, 2]])

In [191]:
# change dimension

# atleast_1d, atleast_2d, atleast_3d - expand dimension; squeeze - remove dims with of length 1

In [194]:
np.array(5)
np.asscalar(np.array(5))

array(5)

5

In [206]:
arr = np.array([2, 1, 5])
arr.shape

arr_exp = np.expand_dims(arr, axis=0)
arr_exp
arr_exp.shape

arr_exp = np.expand_dims(arr, axis=1)
arr_exp
arr_exp.shape

(3,)

array([[2, 1, 5]])

(1, 3)

array([[2],
       [1],
       [5]])

(3, 1)

###  joining arrays

In [210]:
arr1 = np.array([1, 2, 4])
arr2 = np.array([4, 2, 7])

np.concatenate((arr1, arr2))  # concat
# np.concatenate((arr1, arr2), axis=1)  # won't work, axis 1 is not yet there
np.concatenate((np.expand_dims(arr1, 1), np.expand_dims(arr2, 1)), axis=0)  # have a function np.stack for this (!!)
np.concatenate((np.expand_dims(arr1, 1), np.expand_dims(arr2, 1)), axis=1)
np.concatenate((np.expand_dims(arr1, 0), np.expand_dims(arr2, 0)), axis=1)

array([1, 2, 4, 4, 2, 7])

array([[1],
       [2],
       [4],
       [4],
       [2],
       [7]])

array([[1, 4],
       [2, 2],
       [4, 7]])

array([[1, 2, 4, 4, 2, 7]])

In [225]:
np.stack([arr1, arr2], axis=0)
np.stack([arr1, arr2], axis=1)

arrays = [np.zeros((3, 4)) for _ in range(10)]
np.stack(arrays, axis=0).shape  # axis determines the number of the new axis
np.stack(arrays, axis=1).shape
np.stack(arrays, axis=2).shape

np.vstack([arr1, arr2])  # stack vertically (row-wise)
np.hstack([arr1, arr2])  # horizontally (column-wise))

np.column_stack((arr1, arr2, arr1))  # stack tuple of 1d arrays as columns

array([[1, 2, 4],
       [4, 2, 7]])

array([[1, 4],
       [2, 2],
       [4, 7]])

(10, 3, 4)

(3, 10, 4)

(3, 4, 10)

array([[1, 2, 4],
       [4, 2, 7]])

array([1, 2, 4, 4, 2, 7])

array([[1, 4, 1],
       [2, 2, 2],
       [4, 7, 4]])

array([[1, 4, 1],
       [2, 2, 2],
       [4, 7, 4]])

In [216]:
# create ndarray from nested list of blocks
A = np.eye(2) * 2
B = np.eye(3) * 3
np.block([
    [A, np.zeros((2, 3))],
    [np.ones((3, 2)), B ]
])

array([[2., 0., 0., 0., 0.],
       [0., 2., 0., 0., 0.],
       [1., 1., 3., 0., 0.],
       [1., 1., 0., 3., 0.],
       [1., 1., 0., 0., 3.]])

### splitting arrays

In [235]:
arr = np.arange(16).reshape(4, 4)
arr

np.split(arr, 2)  # if integer is passed - split into equal parts along given axis
np.split(arr, 2, axis=1)  
# np.split(arr, 3)  # won't work, need equal split


np.split(np.arange(10), [2, 4, 8])

np.array_split(arr, 3)  # same as np.split but doesn't require equal split

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

[array([[0, 1, 2, 3],
        [4, 5, 6, 7]]),
 array([[ 8,  9, 10, 11],
        [12, 13, 14, 15]])]

[array([[ 0,  1],
        [ 4,  5],
        [ 8,  9],
        [12, 13]]),
 array([[ 2,  3],
        [ 6,  7],
        [10, 11],
        [14, 15]])]

[array([0, 1]), array([2, 3]), array([4, 5, 6, 7]), array([8, 9])]

[array([[0, 1, 2, 3],
        [4, 5, 6, 7]]),
 array([[ 8,  9, 10, 11]]),
 array([[12, 13, 14, 15]])]

In [242]:
arr = np.array([1, 2, 3])

np.tile(arr, 3)  # repeat array

np.repeat(arr, 3, axis=0)  # repeat along axis
np.repeat(arr[:, np.newaxis], 3, axis=0)
np.repeat(arr[:, np.newaxis], 3, axis=1)

array([1, 2, 3, 1, 2, 3, 1, 2, 3])

array([1, 1, 1, 2, 2, 2, 3, 3, 3])

array([[1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [3],
       [3],
       [3]])

array([[1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])

In [255]:
# insert/delete

arr = np.arange(9).reshape(3, 3)
arr

np.delete(arr, obj=1, axis=1)  # delete along axis, obj is int, array of ints or slice

np.insert(arr, obj=[0, 2], axis=1, values=np.arange(6).reshape(3, 2))  # insert along axis to positions, values must have correct shape

np.append(arr, axis=0, values=np.array([3, -2, 9])[np.newaxis, :])  # append along axis, values must have the same dim and correct shape

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

array([[0, 2],
       [3, 5],
       [6, 8]])

array([[0, 0, 1, 1, 2],
       [2, 3, 4, 3, 5],
       [4, 6, 7, 5, 8]])

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 3, -2,  9]])

In [257]:
np.trim_zeros(np.array([0, 0, 0, 2, 3, 4, 0]))  # trim zeros for 1d array

array([2, 3, 4])

In [266]:
arr = np.array([[1, 2, 3], [1, 5, 8]])
arr

np.unique(arr)  # return unique elements, for multidim arrays - they will be flattened

np.flip(arr)  # flip along given axis
np.flip(arr, axis=1)

array([[1, 2, 3],
       [1, 5, 8]])

array([1, 2, 3, 5, 8])

array([[8, 5, 1],
       [3, 2, 1]])

array([[3, 2, 1],
       [8, 5, 1]])

In [270]:
arr = np.arange(9)
arr

arr.reshape(3, 3)  # 'C' order by default, row-based
arr.reshape(3, 3, order='F')  # Fortran column-based order

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

array([[0, 3, 6],
       [1, 4, 7],
       [2, 5, 8]])

### functional programming

In [277]:
arr = np.arange(9).reshape(3, 3)
arr

func = lambda x: x/x.sum()  # must accept 1d arrays
np.apply_along_axis(func, arr=arr, axis=1)  # applied to 1d arrays along axis
np.apply_along_axis(func, arr=arr, axis=0)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

array([[0.        , 0.33333333, 0.66666667],
       [0.25      , 0.33333333, 0.41666667],
       [0.28571429, 0.33333333, 0.38095238]])

array([[0.        , 0.08333333, 0.13333333],
       [0.33333333, 0.33333333, 0.33333333],
       [0.66666667, 0.58333333, 0.53333333]])

In [280]:
arr = np.arange(24).reshape(2, 3, 4)
arr

np.apply_over_axes(np.sum, arr, [0, 2])  # apply func sequentially over a sequence of axes

# np.vectorize - for vertorizing functions

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

array([[[ 60],
        [ 92],
        [124]]])

##  linear algebra

scipy.linalg is complementary to np.linalg (put functions from it in this section)

In [291]:
np.dot(np.array([1, 2, 3]), np.array([4, 3, 5]))

arr1 = np.arange(6).reshape(3, 2)
arr2 = np.arange(-3, 3).reshape(2, 3)
arr1
arr2

np.dot(arr1, arr2)  # matrix product for 2d arrays
np.dot(arr2, arr1)


np.outer(np.ones((5,)), np.linspace(-2, 2, 5))  # outer product of two arrays

25

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[-3, -2, -1],
       [ 0,  1,  2]])

array([[  0,   1,   2],
       [ -6,  -1,   4],
       [-12,  -3,   6]])

array([[ -8, -14],
       [ 10,  13]])

array([[-2., -1.,  0.,  1.,  2.],
       [-2., -1.,  0.,  1.,  2.],
       [-2., -1.,  0.,  1.,  2.],
       [-2., -1.,  0.,  1.,  2.],
       [-2., -1.,  0.,  1.,  2.]])

In [292]:
a = np.arange(60.).reshape(3,4,5)
b = np.arange(24.).reshape(4,3,2)
c = np.tensordot(a,b, axes=([1,0],[0,1]))  # tensor dot product along specified axes
c.shape

(5, 2)

In [294]:
arr = np.arange(9).reshape(3,3)
np.linalg.matrix_power(arr, 3)  # square matrix power

array([[ 180,  234,  288],
       [ 558,  720,  882],
       [ 936, 1206, 1476]])

In [296]:
np.kron([1,10,100], [5,6,7])  # kronecker product of two arrays

array([  5,   6,   7,  50,  60,  70, 500, 600, 700])

In [302]:
arr = np.arange(9).reshape(3,3)
herm_mx = arr.T.dot(arr)
herm_mx

# decompositions
np.linalg.cholesky(herm_mx)  # cholesky

np.linalg.qr(arr)  # QR decomposition

np.linalg.svd(arr)  # SVD

array([[45, 54, 63],
       [54, 66, 78],
       [63, 78, 93]])

array([[6.70820393e+00, 0.00000000e+00, 0.00000000e+00],
       [8.04984472e+00, 1.09544512e+00, 0.00000000e+00],
       [9.39148551e+00, 2.19089023e+00, 1.68587394e-07]])

(array([[ 0.        ,  0.91287093,  0.40824829],
        [-0.4472136 ,  0.36514837, -0.81649658],
        [-0.89442719, -0.18257419,  0.40824829]]),
 array([[-6.70820393e+00, -8.04984472e+00, -9.39148551e+00],
        [ 0.00000000e+00,  1.09544512e+00,  2.19089023e+00],
        [ 0.00000000e+00,  0.00000000e+00, -8.88178420e-16]]))

(array([[-0.13511895,  0.90281571,  0.40824829],
        [-0.49633514,  0.29493179, -0.81649658],
        [-0.85755134, -0.31295213,  0.40824829]]),
 array([1.42267074e+01, 1.26522599e+00, 5.89938022e-16]),
 array([[-0.4663281 , -0.57099079, -0.67565348],
        [-0.78477477, -0.08545673,  0.61386131],
        [-0.40824829,  0.81649658, -0.40824829]]))

In [304]:
np.linalg.eig(arr)  # eigenvalues and right eigenvectors

# also norm (for vector or matrix), det (determinant), matrix_rank

(array([ 1.33484692e+01, -1.34846923e+00, -2.48477279e-16]),
 array([[ 0.16476382,  0.79969966,  0.40824829],
        [ 0.50577448,  0.10420579, -0.81649658],
        [ 0.84678513, -0.59128809,  0.40824829]]))

In [317]:
a = np.arange(1, 10).reshape(3,3) + 3*np.eye(3)
b = np.array([1, 1, 5])

np.linalg.solve(a, b)  # solve linear system of equations

np.linalg.inv(a)  # inverse matrix

a[0, :] = 0
a
np.linalg.pinv(a)  # pseudo-inverse

array([-0.11111111, -0.36111111,  0.72222222])

array([[ 0.44444444,  0.        , -0.11111111],
       [-0.05555556,  0.25      , -0.11111111],
       [-0.22222222, -0.16666667,  0.22222222]])

array([[ 0.,  0.,  0.],
       [ 4.,  8.,  6.],
       [ 7.,  8., 12.]])

array([[-7.49974776e-18, -4.11522634e-02,  5.34979424e-02],
       [ 6.22133360e-17,  2.55144033e-01, -1.31687243e-01],
       [-3.05528295e-17, -1.46090535e-01,  1.39917695e-01]])

In [321]:
a
np.any(a, axis=0)  # any along axis
np.any(a, axis=1)
np.any(a)  # any for flattened array

array([[ 0.,  0.,  0.],
       [ 4.,  8.,  6.],
       [ 7.,  8., 12.]])

array([ True,  True,  True])

array([False,  True,  True])

True

In [322]:
arr = np.array([1, 2, np.nan, 0, -4, np.inf])

np.isnan(arr)
np.isinf(arr)
# same for isnat, isneginf, isposinf

array([False, False,  True, False, False, False])

array([False, False, False, False, False,  True])

In [331]:
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([1.1, 2, 3, 4.2])

np.allclose(arr1, arr2, atol=0.2)  # if all elements are close
np.allclose(arr1, arr2, rtol=1e-2)

np.isclose(arr1, arr2, atol=0.15)  # is close element-wise

np.array_equal(arr1, arr1)  # array equality (equal shape)

np.array_equiv(arr1, arr1.T)  # equivalence (consistent shape)

True

False

array([ True,  True,  True, False])

True

True

False

In [332]:
# trigonometric and hyperbolic functions are available
# exponents and logarithms
# arythmetic operations, mod, divmod, remainder

In [333]:
# rounding
# floor, ceil, round, trunc etc

In [339]:
arr = np.array([1, 2, 3, 4, np.nan, 3, 2, np.nan])

np.sum(arr)
np.nansum(arr)
np.nancumsum(arr)
# same for prod

arr = np.nan_to_num(arr, 0)
arr

np.diff(arr, 1)  # diff of order n along an axis
np.diff(arr, 2)

nan

15.0

array([ 1.,  3.,  6., 10., 10., 13., 15., 15.])

array([1., 2., 3., 4., 0., 3., 2., 0.])

array([ 1.,  1.,  1., -4.,  3., -1., -2.])

array([ 0.,  0., -5.,  7., -4., -1.])

### padding array

padding is adding values to the start/end of the array w.r.t a certain rule (or along axis for multidim array)

In [343]:
a = [1, 2, 3, 4, 5]
np.pad(a, (2, 3), 'constant', constant_values=(4, 6))
np.pad(a, (2, 3), 'edge')
np.pad(a, (2, 3), 'linear_ramp', end_values=(5, -4))
# many different padding modes available

array([4, 4, 1, 2, 3, 4, 5, 6, 6, 6])

array([1, 1, 1, 2, 3, 4, 5, 5, 5, 5])

array([ 5,  3,  1,  2,  3,  4,  5,  2, -1, -4])

In [None]:
# functions for working with polynomial available, coefficient of classical polynomial series (Chebyshev, Lagrange etc)

## numpy.random (random sampling; important changes since version 1.17!)

* BitGenerators are used to produce sequences of random numbers (typically unsigned ints)
* Generators are used to produce distribution samples from them

RandomState are outdated legacy routines and their use is **discouraged** (like np.random rand), they can use only one BitGenerator which Generators can use different onces  **(!!!)**

Generator can be used as a replacement for RandomState.

### why it's recommended not to use global state (like in RandomState); Han also suggested this for qp.checks - understand!

In [358]:
# do this
from numpy.random import default_rng  # default_rng - new instance of Generator
rng = default_rng()
vals = rng.exponential(scale=0.5, size=10)
more_vals = rng.exponential(scale=0.5, size=10)

# instead of this (!)
from numpy import random
vals = random.exponential(scale=0.5, size=10)
more_vals = random.exponential(scale=0.5, size=10)

In [360]:
rng.bit_generator  # getting BitGenerator of a Generator

<numpy.random._pcg64.PCG64 at 0x7f80d9184eb0>

In [361]:
# passing a specific BitGenerator with a seed
from numpy.random import Generator, PCG64
rg = Generator(PCG64(12345))
rg.standard_normal()

-1.4238250364546312

In [368]:
rg = default_rng(12345)  # PCG64 is used by default
rg.random()  # uniform [0, 1]

0.22733602246716966

In [374]:
# see documentation for improvements of the new framework with Generators over RandomState

In [383]:
rg = default_rng()

rg.integers(4, 10, size=3) # random int between low and high; replacement for np.random.randint
# size arg can be added anywhere to generate an iid sequence (we'll omit it)

rg.random()  # uniform on [0,1]
rg.choice([1, 2, 5, 6, 7, -10], 3, replace=False)  # choise with/without replacement

arr = np.arange(4)
rg.shuffle(arr)  # shuffle inplace
arr
rg.permutation(arr)  # random permutation

# and all the same functions for distribution generation that were previously in numpy.random, e.g.
rg.binomial(5, 0.3, size=10)
# etc

array([7, 7, 8])

0.21648641947252867

array([  5,   6, -10])

array([3, 1, 0, 2])

array([2, 0, 1, 3])

array([2, 1, 2, 2, 1, 2, 3, 3, 2, 2])

In [None]:
# in parallel applications it's preferred to use SeedSequence.spawn method
from numpy.random import Generator, Philox, SeedSequence
sg = SeedSequence(1234)
rg = [Generator(Philox(s)) for s in sg.spawn(10)]  # using Philox BitGenerator

### set routings

In [390]:
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([1, 3, 4, 6])

np.unique(arr1)  # unique values
np.intersect1d(arr1, arr2)  # set intersection
np.isin([5, 2], arr1)  # element-wise 'in' check
np.setdiff1d(arr1, arr2)  # set difference

array([1, 2, 3, 4])

array([1, 3, 4])

array([False,  True])

array([2])

### sorting

In [393]:
arr = np.array([3, 2, 4, 1])

np.sort(arr)  # sorting, axis arg - sort along axis or flatten if None, kind='quicksort', 'mergesort' etc

# create a structured array
dtype = [('name', 'S10'), ('height', float), ('age', int)]
values = [('Arthur', 1.8, 41), ('Lancelot', 1.9, 38), ('Galahad', 1.7, 38)]
a = np.array(values, dtype=dtype) 
np.sort(a, order='height')  # specify order arg

array([1, 2, 3, 4])

array([(b'Galahad', 1.7, 38), (b'Arthur', 1.8, 41),
       (b'Lancelot', 1.9, 38)],
      dtype=[('name', 'S10'), ('height', '<f8'), ('age', '<i8')])

In [398]:
np.lexsort(([3, 4, 2], [3, 2, 2]))  # lexicographic sort using a sequency of keys (in order of priority, lowest to highest)

np.argsort(arr)  # indices that would sort the array

array([2, 1, 0])

array([3, 1, 0, 2])

In [407]:
# argmin, argmax, nanargmin, nanargmax

arr = np.array([2, 3, 4, 0, 1])
np.argwhere(arr)  # indices with non-zero values

np.extract(arr>2, arr)  # extract values where condition holds
np.where(arr>2, arr, np.nan)  # condition, array to choose when True and other array (or value) to choose when False

np.count_nonzero(arr)

array([[0],
       [1],
       [2],
       [4]])

array([3, 4])

array([nan,  3.,  4., nan, nan])

4

In [416]:
arr = np.array([1, 3, 4, 5, 6, 8, 12, 34])

np.percentile(arr, 3)  # percentile, can pass axis and interpolation type

np.quantile(arr, 0.3)  # quantile, axis and interpolation type (linear by default)
np.quantile(arr, 0.3, interpolation='lower')
np.quantile(np.arange(9).reshape(3,3), 0.8, axis=1)  # along axis
# nanquantile, nanpercentile

1.42

4.1

4

array([1.6, 4.6, 7.6])

In [425]:
arr1 = np.array([2, -1, 4, -3, 8])
arr2 = np.array([-4, -2, 6, 7, 3])

np.corrcoef(arr1, arr2)  # correl matrix
np.correlate(arr1, arr2)  # dot product (not correlation between -1 and 1 !!)

np.cov(arr1, arr2)  # covariance matrix 

array([[1.        , 0.01199003],
       [0.01199003, 1.        ]])

array([21])

array([[18.5 ,  0.25],
       [ 0.25, 23.5 ]])

## check other function (convolve, histograms, bincount) -- together with checking scipy!

In [427]:
# useful test functions (see docs)

# assert_allclose
# assert_almost_equal

## explore numpy dispatch

In [1]:
import numpy as np 

In [2]:
class MyArray(np.ndarray):
    def __init__

SyntaxError: unexpected EOF while parsing (<ipython-input-2-c03f32997c28>, line 1)