In [6]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np

In [7]:
# main type ndarray

arr = np.array([1, 2])
type(arr)
isinstance(arr, np.ndarray)

numpy.ndarray

True

floating data representation : matissa + exponent

e.g. ```6.573*10^5``` - mantissa is 6.573 and exponent is 5

* single precision: 32 bits overall, 23 bits for mantissa, 8 bits for exponent
* double precision: 64 bits overall, 52 bins for mantissa, 11 bits for exponent

double precision should be use in most numerical computations (types like np.float64)

* ndarray - collection of items of the same type
* indexed by integers
* homogeneous - all block are interpreted in exactly the same way

* each object interpretation - specified by *data-type object*
* array elements can be built-in and custom structures and classes

In [14]:
arr = np.array([[1, 2], [2, 5]], dtype=np.int32)  # can construct using a dtype
arr.dtype
arr.shape

# usual indexing and slicing works
arr[1, 1]
arr[(1, 1)]  # can index with a tuple

# ndarray is mutable
arr[0,1] = -18
arr

dtype('int32')

(2, 2)

5

5

array([[  1, -18],
       [  2,   5]], dtype=int32)

In [20]:
np.ndarray((3, 5), dtype=np.float64)  # constructor, filled with some garbage

array([[4.9e-324, 9.9e-324, 1.5e-323, 2.0e-323, 2.5e-323],
       [3.0e-323, 3.5e-323, 4.0e-323, 4.4e-323, 4.9e-323],
       [5.4e-323, 5.9e-323, 6.4e-323, 6.9e-323, 7.4e-323]])

Numpy keeps data in a single sequential array in memory. Multidimensional structure can be based on row-major (C-style) or column major (F-style) order

Array stored in a single sequence also called contiguous array (C-contiguous, F-contiguous)

In [32]:
np.ndarray((2, 2), buffer=np.array([1, 2, 3, 4]), dtype=int)  # when passing a buffer need to specify dtype ! to decode correctly

np.ndarray((2, 2), buffer=np.array([1, 2, 3, 4]), dtype=int, order='F')  # Fstyle order (column based)

array([[1, 2],
       [3, 4]])

array([[1, 3],
       [2, 4]])

### other constructors

In [46]:
np.ones((4, 5), dtype=bool)
np.ones((4, 5), dtype=np.float64)

np.zeros((4, 3), dtype=int)

np.full(shape=(2,3), fill_value='hello')

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

array([['hello', 'hello', 'hello'],
       ['hello', 'hello', 'hello']], dtype='<U5')

## Methods

a lot of methods exists as array object methods and as np functions (like np.any(arr) or arr.any())

In [47]:
arr = np.array([1, 3, 4, 6, 2, 5, 3, 4, 7, 6, 4])

In [50]:
arr.any()  # if any element is True/non-zero
np.zeros(4).any()

True

False

In [52]:
# argmax, argmin

arr.argsort()  # returns indices that would sort the array

array([ 0,  4,  1,  6,  2,  7, 10,  5,  3,  9,  8])

In [54]:
# casting

arr.astype(float)

arr.astype(str)

array([1., 3., 4., 6., 2., 5., 3., 4., 7., 6., 4.])

array(['1', '3', '4', '6', '2', '5', '3', '4', '7', '6', '4'],
      dtype='<U21')

### Endianness

The smallest data group with an address is eight bits long and is called a byte. Larger groups comprise two or more bytes, for example a 32-bit word contains four bytes. There are two possible ways a computer could number the individual bytes in a larger group, starting at either end, and over the course of computer history, both methods have been used, leading to occasional problems.

Internally, any given computer will work equally well regardless of what endianness it uses, since its hardware will consistently use the same endianness to both store and load its data. For this reason, programmers and computer users normally ignore the endianness of the computer they are working with. 

However, endianness can become an issue when moving data external to the computer – as when transmitting data between different computers, or a programmer investigating internal computer bytes of data from a memory dump – and the endianness used differs from expectation. In these cases, the endianness of the data must be understood and accounted for.

In [59]:
arr.byteswap()  # toggle between big-endian and little-endian
# on any given computer one part will be a mess, another one - true numbers; but for external data may need to swap byte order to get real values
arr.byteswap().byteswap()

array([ 72057594037927936, 216172782113783808, 288230376151711744,
       432345564227567616, 144115188075855872, 360287970189639680,
       216172782113783808, 288230376151711744, 504403158265495552,
       432345564227567616, 288230376151711744])

array([1, 3, 4, 6, 2, 5, 3, 4, 7, 6, 4])

In [66]:
np.choose([1, 0, 1], [[2, 3, 3], [8, 4, -2]])  # choose 1st, 2nd, 3rd etc elements from different arrays for which the numbers are defined in the first index arg

# clip

array([ 8,  3, -2])

In [81]:
np.arange(6).reshape(3, 2, order='F')
arr = np.arange(6).reshape(3, 2)  # order is 'C' by default
arr

arr.flatten()
arr.flatten(order='F')


arr.compress([True, False, True], axis=0)  # apply bool array mask along given axis

array([[0, 3],
       [1, 4],
       [2, 5]])

array([[0, 1],
       [2, 3],
       [4, 5]])

array([0, 1, 2, 3, 4, 5])

array([0, 2, 4, 1, 3, 5])

array([[0, 1],
       [4, 5]])

In [79]:
# copy, cumsum, cumprod
arr.cumsum()  # will flatten the array if no dim specified
np.cumsum(arr, axis=0)

arr.f

array([ 0,  1,  3,  6, 10, 15])

array([[0, 1],
       [2, 4],
       [6, 9]])

array([[0, 1],
       [2, 3],
       [4, 5]])

In [128]:
arr = np.arange(12).reshape(4, 3)
arr

arr.diagonal()  # one of the matrix diagonals
arr.diagonal(offset=1)
arr.diagonal(offset=-1)

arr.trace(offset=-1)  # same rules but calculates the diagonal value sum (trace)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

array([0, 4, 8])

array([1, 5])

array([ 3,  7, 11])

21

In [103]:
np.arange(4).dot(np.array([1, 1, -1, 1]))  # array dot-product

arr.T.dot(np.array([1, 1, -1, 1]))  # matrix-vector dot-product

arr = np.arange(4)
arr.shape  # one-dim array - cannot transpose
arr.reshape(1, 4)  # need to create 2d array, then can transpose and use in matrix operations
# 1d array can be used in matrix products if the required shape is (len(arr), 1)

2

2

(4,)

array([[0, 1, 2, 3]])

In [None]:
# np.dump(file), np.load(file) - pickle for np arrays

# min, max, mean, nonzero, std, sum, squeeze, var

In [107]:
# ravel vs flatten : both functions for flattening the array

arr = np.arange(6).reshape(3, 2)
arr

arr_flat = arr.flatten()  # returns a copy
arr_flat[0] = -18
arr

arr_flat = arr.ravel()  # return a view of the original array where possible
arr_flat[0] = -18
arr

# representations of the original arrays are called views (i.e. if we change smth in the view - we'll see the change in the original array)

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[-18,   1],
       [  2,   3],
       [  4,   5]])

In [111]:
np.repeat(np.arange(3), 5)

# np.repeat(np.arange(3), 5, axis=1)  # not possible
np.repeat(np.arange(3).reshape(3, 1), 5, axis=1)  # ok after creating a 2d array

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

array([[0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [2, 2, 2, 2, 2]])

In [112]:
np.round([2.43, 1.45], 1)

array([2.4, 1.4])

In [114]:
arr
arr.flags  # some useful info; can also set flags using .setflags(..)

array([[-18,   1],
       [  2,   3],
       [  4,   5]])

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [117]:
# sort

a = np.array([[1,4],[3,1]])
np.sort(a)                # sort along the last axis

np.sort(a, axis=None)     # sort the flattened array

np.sort(a, axis=0)        # sort along the first axis

array([[1, 4],
       [1, 3]])

array([1, 1, 3, 4])

array([[1, 1],
       [3, 4]])

# explore dtype with field names !! (structured array?)

In [123]:
arr = np.arange(6).reshape(3, 2)
arr.take([(1, 2), (0, 2), (1,0)])  # select values for a groups of indices

np.arange(1, 5)[[1, 2]]  # simpler approach will work for 1d array 

array([[1, 2],
       [0, 2],
       [1, 0]])

array([2, 3])

In [124]:
arr.tobytes()  # convert to binary string

b'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00'

In [127]:
arr.tolist()

[[0, 1], [2, 3], [4, 5]]

In [135]:
arr
arr.swapaxes(1, 0)  # same as transpose

arr3d = np.arange(27).reshape(3, 3, 3)
arr3d
arr3d.swapaxes(0, 2)  # swap any pair of axes

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

array([[ 0,  3,  6,  9],
       [ 1,  4,  7, 10],
       [ 2,  5,  8, 11]])

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

array([[[ 0,  9, 18],
        [ 3, 12, 21],
        [ 6, 15, 24]],

       [[ 1, 10, 19],
        [ 4, 13, 22],
        [ 7, 16, 25]],

       [[ 2, 11, 20],
        [ 5, 14, 23],
        [ 8, 17, 26]]])

In [153]:
arr = np.array([[1, 2, 3], [5, 2, 6]], dtype=np.int64)
arr

# generating new data views of the same array (the same bytes in memory interpreted for a different dtype, not type casting!)
arr.view(dtype=bool)
arr.view(dtype=float)
arr.view(dtype=np.int32)

mx = arr.view(type=np.matrix)  # can also pass numpy data type (e.g. view arr as matrix)
mx
mx[0, 0] = -20  # it's a view, so points to the same data, and can change it
arr

array([[1, 2, 3],
       [5, 2, 6]])

array([[ True, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False, False],
       [ True, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False, False]])

array([[4.9e-324, 9.9e-324, 1.5e-323],
       [2.5e-323, 9.9e-324, 3.0e-323]])

array([[1, 0, 2, 0, 3, 0],
       [5, 0, 2, 0, 6, 0]], dtype=int32)

matrix([[1, 2, 3],
        [5, 2, 6]])

array([[-20,   2,   3],
       [  5,   2,   6]])

In [156]:
# attributes
arr

arr.nbytes
arr.shape
arr.ndim
arr.size
arr.data

array([[-20,   2,   3],
       [  5,   2,   6]])

48

(2, 3)

2

6

<memory at 0x7f1eee5ecad0>

In [162]:
arr1 = np.array([2, 1, 4])
arr2 = np.array([3, 0, 3])

arr1 <= arr2  # same shape comparison
arr1 * arr2  # component-wise arythmetic operations

array([ True, False, False])

array([ 6,  0, 12])

In [179]:
# scalar arrays
scalar_arr = np.array(5)
scalar_arr

scalar_arr.shape
scalar_arr.sum()

array(5)

()

5

### array scalars

In [194]:
isinstance(4, np.int64)
isinstance(np.int64(4), np.int64)
isinstance(np.array(4), np.int64)

False

True

False

In [205]:
isinstance(4, np.object)

True

In [211]:
# numpy has a complicated hierarchy of types
# it supports native python types : np.int, np.float, np.object, np.bool etc
# but has many more type with extra info : like np.int32, np.int64, np.uint8 etc

x = np.int64(3)
type(x)
isinstance(x, np.int)
isinstance(x, np.integer)  # do not mix int (python built-in) and integer

x = 3
type(x)
# from python output it's not clear what kind of e.g. int we have

numpy.int64

False

True

int

In [213]:
# numpy supports scalars, and the scalar type hierarchy mimics (almost) the dtype hieararchy

x = np.array([1, 2, 3], dtype=np.int64)
type(x[1])
type(np.array(x[1]))  # scalar array is different though from numpy scalar

numpy.int64

numpy.ndarray

In [222]:
# numpy scalars have a lot of various method and attributes (support all of ndarray methods, though a lot are not implemented in numpy.generic - base class for all scalar types)

# all numpy scalars are inherited from numpy.generic that has all the same methods as ndarray
# but a lot of them don't make sense for scalars and are not implemented

np.int64(4).flags
np.int64(4).ndim
np.int64(4).sum()

  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : False
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

0

4

True

In [221]:
x = np.int32(5)
isinstance(x, np.generic)

True

#### definining new type

* can subclass ndarray and define a method of interest : but this will only work to some extent; a lot of the things that work in ndarray are based on the check that this is ndarray, so some functionality will be lost
* the best way to define it in C using numpy C-API
* often inheritance is probably not needed, potentially a function or a new class which has ndarray as a data field (composition) will work well

### data types

* np.dtype - data type objects
* describe the type, size, byteorder (little/big endian), data element own features if it's a structured object
* a numpy scalar type is associated with each data type, any element extracted from ndarray via indexing will have this data type

**np.dtype objects are not the same as scalar type objects, but can used instead of one another when creating arrays**

In [235]:
np.dtype('<i4') == np.int32  # dtype and the corresponding scalar type

dt = np.dtype('>i4')  # > specifies big-endian byteorder
dt.byteorder
dt.name

isinstance('abs', np.unicode)  # unicode scalar type
np.array('abs').dtype  # unicode dtype '<U3' - 3 chars

True

'>'

'int32'

True

dtype('<U3')

### structured data type

In [248]:
# dtype can be specified by dict-like list of tuples
dt = np.dtype([('name', np.unicode_, 16), ('grades', np.float64, (2,))])
dt
dt['name']
dt['grades']

dtype([('name', '<U16'), ('grades', '<f8', (2,))])

dtype('<U16')

dtype(('<f8', (2,)))

In [249]:
# can use structured data type for arrays with elements of mixed data
x = np.array([('Sarah', (8.0, 7.0)), ('John', (6.0, 7.0))], dtype=dt)
x

x['name']  # names used in the dtype structure can be used to access numpy array subarrays (just like in pandas)
x['grades']
x[1]
x[0]['name']

array([('Sarah', [8., 7.]), ('John', [6., 7.])],
      dtype=[('name', '<U16'), ('grades', '<f8', (2,))])

array(['Sarah', 'John'], dtype='<U16')

array([[8., 7.],
       [6., 7.]])

('John', [6., 7.])

'Sarah'

In [255]:
# np.dtype is a dtype constructor, can accept python types, list for of tuples (name, scalar type), type strings etc
np.dtype(int)
np.dtype(np.int64)
np.dtype('f8')  # type string

# dtype also has functions to change byteorder

dtype('int64')

dtype('int64')

dtype('float64')

In [277]:
dt = np.dtype(int)

# dtype object attributes
dt.str
dt.name
dt.num  # unique id number (for each dtype)
dt.type
dt.itemsize

'<i8'

'int64'

7

numpy.int64

8

In [280]:
# for structured dtypes
dt = np.dtype([('name', '<U24'), ('score', np.int64)])
dt

dt.fields  # returns a dict
dt.names

dtype([('name', '<U24'), ('score', '<i8')])

mappingproxy({'name': (dtype('<U24'), 0), 'score': (dtype('int64'), 96)})

('name', 'score')

In [265]:
dt.kind

# various dtype kinds (may be useful)
print("""
b boolean
i signed integer
u unsigned integer
f floating-point
c complex floating-point
m timedelta
M datetime
O object
S (byte-)string
U Unicode
V void
""")

'i'


b boolean
i signed integer
u unsigned integer
f floating-point
c complex floating-point
m timedelta
M datetime
O object
S (byte-)string
U Unicode
V void



In [266]:
x = np.array([1, 2, 3])
x.dtype.kind == 'i'

True

**numbers** module - hierarchy of abstract numerical classes (not of which can be instantiated)

In [272]:
from numbers import Number, Complex, Real, Rational, Integral

isinstance(np.int64(3), Integral)
isinstance(np.float64(3), Integral)
isinstance(np.float64(3), Real)
isinstance(np.float64(3), Rational)

True

False

True

False

In [275]:
# can use issubclass to check if numbers types encompass numpy types
issubclass(np.int64, Integral)
issubclass(np.float16, Real)

True

True

In [284]:
dt = np.dtype(('i4', (2, 3)))
dt.shape  # shape and subdtype for structured dtypes
dt.subdtype

(2, 3)

(dtype('int32'), (2, 3))

### indexing

In python in general ```x[i1, i2, i3]``` is equivalent to ```x[(i1, i2, i3)]```

In [287]:
# basic indexing

arr = np.arange(12)
arr[2:9:2]  # slicing with step

array([2, 4, 6, 8])

In [291]:
arr_new = arr[1:5, np.newaxis]  # np.newaxis slice is used to expand the dimension
arr_new
arr_new.shape, arr.shape

array([[1],
       [2],
       [3],
       [4]])

((4, 1), (12,))

In [294]:
arr = np.array([[1, 2, 3], [5, 3, 6]])
arr[1, :]  # ndim is reduced by one
arr[1:2, :]  # same ndim

array([5, 3, 6])

array([[5, 3, 6]])

In [299]:
# arr[(1, 2)] - basic indexing
# arr[(1, 2),] - will triger advanced indexing, arr[[1, 2]] is also advanced indexing

# integer and boolean advanced indexing

# integer indexing 
x = np.array([[1, 2], [3, 4], [5, 6]])
x[[0, 1, 2], [0, 1, 0]]  # select using row and columns indices
x[(0, 1)]  # to basic
x[[0, 1]]  # to advanced (starts from first dimensions)

array([1, 4, 5])

2

array([[1, 2],
       [3, 4]])

In [300]:
# boolean indexing
arr = np.arange(8)
arr[arr < 6]

# behaviour is similar to passing bool_obj.nonzero() integer index

array([0, 1, 2, 3, 4, 5])

In [301]:
(arr<6).nonzero()

(array([0, 1, 2, 3, 4, 5]),)

In [304]:
# structured array indexing

arr = np.array([[1, 4, 2], [2, 5, 4], [1, 2, 6]], dtype=np.dtype([('a', '<i8'), ('b', '<i8'), ('c', '<i8')]))
arr

array([[(1, 1, 1), (4, 4, 4), (2, 2, 2)],
       [(2, 2, 2), (5, 5, 5), (4, 4, 4)],
       [(1, 1, 1), (2, 2, 2), (6, 6, 6)]],
      dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')])

In [355]:
# to create structured array : need to pass array of tuples, and specify each tuple entry dtype in the dtype constructor (so structured array fields won't correspond to e.g. indiv. columns as in pandas)
dt = np.dtype([('a', 'i8'), ('b', 'i8'), ('c', 'i8')])
arr = np.array([(1, 4, 2), (2, 5, 4), (1, 2, 6)], dtype=dt)
arr
arr.shape
arr['b']  # we can access indiv. field dict-like (select single or several tuple entries from each tuple item)
arr[['a', 'c']]
arr['b'][1]

arr.astype(np.object)  # remove specification of the entry (leave general object)

array([(1, 4, 2), (2, 5, 4), (1, 2, 6)],
      dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')])

(3,)

array([4, 5, 2])

array([(1, 2), (2, 4), (1, 6)],
      dtype={'names':['a','c'], 'formats':['<i8','<i8'], 'offsets':[0,16], 'itemsize':24})

5

array([(1, 4, 2), (2, 5, 4), (1, 2, 6)], dtype=object)

In [333]:
arr = np.arange(9).reshape(3, 3)
arr

flat_it = arr.flat  # returns a flattened iterator
flat_it
list(flat_it)

np.nditer(arr)  # provides iterator, but over elements as scalar arrays

# can use nditer and ndararys with various specifications inside Cython code to increase performance (along with using numba or C-extension)
# check numpy C-API (!) for writing C-extensions

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

<numpy.flatiter at 0x5589ba656bd0>

[0, 1, 2, 3, 4, 5, 6, 7, 8]

<numpy.nditer at 0x7f1eee58d210>

In [341]:
# default internal loop goes through elements one by one
# passing external_loop flag with allow numpy to use vectorized code and process larger chunks of data
np.nditer(arr, flags=['external_loop'])

# it's still likely better to use numba, Cython or C-extension for time-critical loops

<numpy.nditer at 0x7f1eee58d8f0>

### It is not recommended to subclass ndarrays (!) Instead numpy dispatch mechanism should be used

### It is recommended not to use np.matrix subclass, it's used mostly for interacting with scipy.sparse, and eventually will be removed in the future  (even for linear algebra regular arrays are recommended)

### Chararray is not recommended to new development (exists for compatibility)

In [348]:
#  memory map
# used to map binary file to np.ndarray and access parts of the data without loading the whole file
# (useful for big files)

data = np.arange(12, dtype='float32')
data.resize((3,4))

from tempfile import mkdtemp  # useful for making temp dirs
from pathlib import Path
filename = Path(mkdtemp())/'newfile.dat'

# create memory map
fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))

# assign data to it
fp[:] = data[:]

del fp  # deletion flushed memory to disk (could also use .flush())

In [349]:
# can now read the data
newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))  # can specify filemode (r, rw, r+ etc)
newfp[1, 3]

7.0

### record arrays

array that allows field access using attributes

In [353]:
np.recarray  # ndarray subclass
np.record  # corresponding scalar class

numpy.recarray

numpy.record

In [360]:
# standard array creation with structured dtype
arr = np.array([(1.0, 2), (3.0, 4)], dtype=[('x', '<f8'), ('y', '<i8')])
arr

arr['x']
#arr.x  # won't work

arr = arr.view(np.recarray)  # creating record array
arr.x

array([(1., 2), (3., 4)], dtype=[('x', '<f8'), ('y', '<i8')])

array([1., 3.])

array([1., 3.])

In [383]:
# recarray using constructor (names and formats arguments, and an array of tuples)
rec_arr = np.recarray(names=['a', 'b'], formats=['i8', 'f8'], 
        buf=np.array([(5, 4), (2, 6), (1, 2)]), shape=(3,))
rec_arr
rec_arr['a']


# recarray form 2D array (!) - interprets rows as corresponding to the recarray fields
arr = np.arange(6).reshape(2, 3)
rec_arr = np.core.records.fromarrays(arr, names=['a', 'b'], formats=['i8', 'f8'])
type(rec_arr)
rec_arr['b']
rec_arr['a']
rec_arr.dtype

# recarray has similar method for ndarray

rec.array([(5, 2.e-323), (2, 3.e-323), (1, 1.e-323)],
          dtype=[('a', '<i8'), ('b', '<f8')])

array([5, 2, 1])

numpy.recarray

array([3., 4., 5.])

array([0, 1, 2])

dtype((numpy.record, [('a', '<i8'), ('b', '<f8')]))

### masked arrays

```np.ma``` module - the same functionality as ndarray plus working with masks and missing data

In [390]:
arr = np.array([1, 2, 3, -1, 4])
mx = np.ma.masked_array(arr, mask=[0,0,0,1,0])  # 1 or True at the invalid values location
mx

mx.mean()  # mean without the invalid value

type(mx)  # MaskedArray - subclass of ndarray

masked_array(data=[1, 2, 3, --, 4],
             mask=[False, False, False,  True, False],
       fill_value=999999)

2.5

numpy.ma.core.MaskedArray

In [392]:
mx = np.ma.masked_array(arr, mask=[0,0,0,1,0], fill_value=12)  # get add a fill_value
mx

masked_array(data=[1, 2, 3, --, 4],
             mask=[False, False, False,  True, False],
       fill_value=12)

In [398]:
mx
mx.astype(float)  # standard ndarray methods work

masked_array(data=[1, 2, 3, --, 4],
             mask=[False, False, False,  True, False],
       fill_value=12)

masked_array(data=[1.0, 2.0, 3.0, --, 4.0],
             mask=[False, False, False,  True, False],
       fill_value=12.0)

In [410]:
arr = np.arange(10)
arr_masked = np.ma.masked_where(arr > 5, arr)  # mask using a condition
arr_masked

arr_masked.mask  # accessing the mask
arr_masked.compressed()  # get only valid values

masked_array(data=[0, 1, 2, 3, 4, 5, --, --, --, --],
             mask=[False, False, False, False, False, False,  True,  True,
                    True,  True],
       fill_value=999999)

array([False, False, False, False, False, False,  True,  True,  True,
        True])

array([0, 1, 2, 3, 4, 5])

In [412]:
arr_masked
arr_masked[0] = np.ma.masked  # masking another value(s); assign any value to unmask instead
arr_masked

masked_array(data=[0, 1, 2, 3, 4, 5, --, --, --, --],
             mask=[False, False, False, False, False, False,  True,  True,
                    True,  True],
       fill_value=999999)

masked_array(data=[--, 1, 2, 3, 4, 5, --, --, --, --],
             mask=[ True, False, False, False, False, False,  True,  True,
                    True,  True],
       fill_value=999999)

In [414]:
# can perform some math operations on masked arrays
x = np.array([2, -1, 3, 4])
np.log(x)  # produced nan
np.ma.log(x)  # produces masked array

array([0.69314718,        nan, 1.09861229, 1.38629436])

masked_array(data=[0.6931471805599453, --, 1.0986122886681098,
                   1.3862943611198906],
             mask=[False,  True, False, False],
       fill_value=1e+20)

In [416]:
arr = np.array([1, 2, 3, -1, 2, -4])
arr_masked = np.ma.masked_values(arr, -1)  # masking certain value
arr_masked

arr_masked.filled(10)  # filling missing value

masked_array(data=[1, 2, 3, --, 2, -4],
             mask=[False, False, False,  True, False, False],
       fill_value=-1)

array([ 1,  2,  3, 10,  2, -4])

In [418]:
arr_masked
arr_masked[3] == np.ma.masked

masked_array(data=[1, 2, 3, --, 2, -4],
             mask=[False, False, False,  True, False, False],
       fill_value=-1)

masked

In [422]:
is_equal = arr_masked == arr_masked.copy()  # comparison keeps the mask structure
is_equal

is_equal.all()  # correct values

# compare with 
(np.array([1, 3, np.nan]) == np.array([1, 3, np.nan])).all()

masked_array(data=[True, True, True, --, True, True],
             mask=[False, False, False,  True, False, False],
       fill_value=True)

True

False

In [427]:
arr_nans = np.array([1, 2, 4, np.nan, 2])
arr_masked = np.ma.masked_invalid(arr_nans)  # array with nans to masked array
arr_masked

masked_array(data=[1.0, 2.0, 4.0, --, 2.0],
             mask=[False, False, False,  True, False],
       fill_value=1e+20)