# 4.1 The NumPy ndarray: A Multidimensional Array Object

In [1]:
import numpy as np

In [2]:
data = np.array([[1.5, -0.1, 3], [0, -3, 6.5]])
data

array([[ 1.5, -0.1,  3. ],
       [ 0. , -3. ,  6.5]])

In [3]:
#write mathematical operations with data

data * 10

array([[ 15.,  -1.,  30.],
       [  0., -30.,  65.]])

In [4]:
data + data

array([[ 3. , -0.2,  6. ],
       [ 0. , -6. , 13. ]])

In [5]:
#find dimension(shape) of data

data.shape

(2, 3)

In [7]:
#find data type of 'data'

data.dtype

dtype('float64')

In [8]:
#Creating ndarrays

data1 = [6, 7.5, 8, 0, 1]

arr1 = np.array(data1)
arr1

array([6. , 7.5, 8. , 0. , 1. ])

In [9]:
#Nested sequences will be converted into a multidimensional array

data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]

arr2= np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [11]:
#finding data type of ndarray

arr1.dtype

dtype('float64')

In [12]:
arr2.dtype

dtype('int32')

In [13]:
#Inspecting with (ndim) and (shape)

arr2.ndim

2

In [14]:
arr2.shape

(2, 4)

In [15]:
#numpy.zeros --create array of zeros// numpy.ones --create array of ones

np.zeros(5)

array([0., 0., 0., 0., 0.])

In [17]:
np.ones((3,3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [19]:
#numpy.empty creates an array without initializing values to any particular value

np.empty((2,3,3))

array([[[1.37962049e-306, 1.24610791e-306, 1.11260959e-306],
        [1.69109959e-306, 9.34603679e-307, 1.42419802e-306],
        [1.78019082e-306, 4.45061456e-308, 1.24612081e-306]],

       [[1.37962049e-306, 9.34597567e-307, 1.29061821e-306],
        [1.78019625e-306, 1.11255866e-306, 8.90098127e-307],
        [9.34609790e-307, 3.91792279e-317, 1.24610587e-306]]])

In [21]:
#numpy.arange is an array-valued version of the built-in Python 'range' function

np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# Setting data types explicitly with the dtype argument.
# Distinct names are used so that this cell does not overwrite arr1/arr2:
# the later comparison `arr3 > arr2` relies on the 2 x 4 arr2 created above.

float_arr = np.array([1, 2, 3], dtype=np.float64)
int_arr = np.array([1, 2, 3], dtype=np.int32)

In [22]:
#explicitly convert or cast an array from one data type to another using 'astype'

arr3 = np.array([7,8,9,10])

arr3.dtype

dtype('int32')

In [24]:
float_arr3 = arr3.astype(np.float64)
float_arr3

array([ 7.,  8.,  9., 10.])

In [26]:
float_arr3.dtype

dtype('float64')

In [27]:
# If you have an array of strings representing numbers, you can use astype to convert them to numeric form.
# np.bytes_ is the byte-string scalar type; the old alias np.string_ was removed in NumPy 2.0.

numeric_strings = np.array(["1.25", "-9.6", "42"], dtype=np.bytes_)
numeric_strings.astype(float)

array([ 1.25, -9.6 , 42.  ])

In [28]:
#Arithmetic with NumPy Arrays

arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [29]:
arr**2

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [30]:
1/ arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [32]:
#Comparisons between arrays of the same size yield Boolean arrays; here arr3 (shape (4,)) is broadcast against each row of arr2 (shape (2, 4))

arr3 > arr2

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True]])

In [33]:
#Basic Indexing and Slicing

arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [35]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [36]:
arr[5:8]

array([5, 6, 7])

In [39]:
arr[5:8] = 100
arr

array([  0,   1,   2,   3,   4, 100, 100, 100,   8,   9])

In [40]:
arr_slice = arr[5:8]
arr_slice

array([100, 100, 100])

In [41]:
arr_slice[0] = 99
arr

array([  0,   1,   2,   3,   4,  99, 100, 100,   8,   9])

In [42]:
#Boolean Indexing

names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])

data = np.array(
    [[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2], [-12, -4], [3, 4]]
)

In [43]:
names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')

In [44]:
data

array([[  4,   7],
       [  0,   2],
       [ -5,   6],
       [  0,   0],
       [  1,   2],
       [-12,  -4],
       [  3,   4]])

In [45]:
names == "Bob"

array([ True, False, False,  True, False, False, False])

In [46]:
#The Boolean array must be of the same length as the array axis it’s indexing

data[names == "Bob"]

array([[4, 7],
       [0, 0]])

In [2]:
# Fancy Indexing

# Build an 8 x 4 float array in which every entry of row i equals i.
arr = np.zeros((8, 4))

for i, row in enumerate(arr):
    row[...] = i  # each row is a view, so the assignment writes through to arr

arr

array([[0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [2., 2., 2., 2.],
       [3., 3., 3., 3.],
       [4., 4., 4., 4.],
       [5., 5., 5., 5.],
       [6., 6., 6., 6.],
       [7., 7., 7., 7.]])

In [3]:
arr[[4,3,0,6]]

array([[4., 4., 4., 4.],
       [3., 3., 3., 3.],
       [0., 0., 0., 0.],
       [6., 6., 6., 6.]])

In [5]:
arr = np.arange(32).reshape((8,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [6]:
arr [[1,5,7,2], [0,3,1,2]]

array([ 4, 23, 29, 10])

In [7]:
arr[[1,5,7,2]][:, [0,3,1,2]]

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

In [8]:
arr[[1,5,7,2], [0,3,1,2]]

array([ 4, 23, 29, 10])

In [9]:
arr[[1,5,7,2], [0,3,1,2]] = 0
arr

array([[ 0,  1,  2,  3],
       [ 0,  5,  6,  7],
       [ 8,  9,  0, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22,  0],
       [24, 25, 26, 27],
       [28,  0, 30, 31]])

In [10]:
# Transposing Arrays and Swapping Axes

arr = np.arange(15).reshape((3,5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [11]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [13]:
# When doing matrix computations, you may do this very often—for example, 
# when computing the inner matrix product using numpy.dot() 

arr = np.array([[0, 1, 0], [1, 2, -2], [6, 3, 2], [-1, 0, -1], [1, 0, 1]])
arr

array([[ 0,  1,  0],
       [ 1,  2, -2],
       [ 6,  3,  2],
       [-1,  0, -1],
       [ 1,  0,  1]])

In [15]:
np.dot(arr.T, arr)

array([[39, 20, 12],
       [20, 14,  2],
       [12,  2, 10]])

In [16]:
# The @ infix operator is another way to do matrix multiplication:

arr.T @ arr

array([[39, 20, 12],
       [20, 14,  2],
       [12,  2, 10]])

In [17]:
#Simple transposing with .T is a special case of swapping axes. ndarray has the method
#swapaxes, which takes a pair of axis numbers and switches the indicated axes to
#rearrange the data:

arr.swapaxes(0,1)

array([[ 0,  1,  6, -1,  1],
       [ 1,  2,  3,  0,  0],
       [ 0, -2,  2, -1,  1]])

# 4.2 Pseudorandom Number Generation

In [19]:
samples = np.random.standard_normal(size = (4,4))
samples

array([[ 0.00235529, -0.13151508,  1.10610418,  0.79088196],
       [-0.26184629,  0.57371415, -0.07473965,  0.69473805],
       [-0.03106983, -0.88123842, -0.24324499, -0.24656793],
       [-1.57626518,  0.80033656,  1.14054567,  0.48713989]])

Python’s built-in random module, by contrast, samples only one value at a time. As
you can see from this benchmark, numpy.random is well over an order of magnitude
faster for generating very large samples:


In [None]:
from random import normalvariate 

N = 1_000_000

%timeit samples = [normalvariate(0, 1) for _ in range(N)]
1.04 s +- 11.4 ms per loop (mean +- std. dev. of 7 runs, 1 loop each)

%timeit np.random.standard_normal(N)
21.9 ms +- 155 us per loop (mean +- std. dev. of 7 runs, 10 loops each)

These random numbers are not truly random (rather, pseudorandom) but instead
are generated by a configurable random number generator that deterministically
determines what values are created. Functions like numpy.random.standard_normal use
the numpy.random module’s default random number generator, but your code can be
configured to use an explicit generator:

In [22]:
rng = np.random.default_rng(seed=12345)

data = rng.standard_normal((2, 3))

The seed argument is what determines the initial state of the generator, and the state
changes each time the 'rng' object is used to generate data. The generator object rng is
also isolated from other code which might use the numpy.random module:

In [23]:
type(rng)

numpy.random._generator.Generator

# 4.3 Universal Functions: Fast Element-Wise Array Functions

A universal function, or ufunc, is a function that performs element-wise operations
on data in ndarrays. You can think of them as fast vectorized wrappers for simple
functions that take one or more scalar values and produce one or more scalar results.

In [25]:
# Many ufuncs are simple element-wise transformations, like numpy.sqrt() or numpy.exp()

arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [26]:
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [27]:
np.exp(arr)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])

In [28]:
# These are referred to as unary ufuncs. Others, such as numpy.add() or numpy.maximum(),
# take two arrays (thus, binary ufuncs) and return a single array as the result:

x= rng.standard_normal(8)

y = rng.standard_normal(8)

In [29]:
x

array([-1.3677927 ,  0.6488928 ,  0.36105811, -1.95286306,  2.34740965,
        0.96849691, -0.75938718,  0.90219827])

In [30]:
y

array([-0.46695317, -0.06068952,  0.78884434, -1.25666813,  0.57585751,
        1.39897899,  1.32229806, -0.29969852])

In [31]:
# In this example, numpy.maximum computed the element-wise maximum of the elements in x and y
# (compares and returns the greater value of the two)

np.maximum(x,y)

array([-0.46695317,  0.6488928 ,  0.78884434, -1.25666813,  2.34740965,
        1.39897899,  1.32229806,  0.90219827])

In [33]:
# While not common, a ufunc can return multiple arrays. numpy.modf() is one example:
# a vectorized version of the built-in Python math.modf(), it returns the fractional and
# integral parts of a floating-point array:

arr = rng.standard_normal(7) * 5
arr

array([ 4.51459671, -8.10791367, -0.7909463 ,  2.24741966, -6.71800536,
       -0.40843795,  8.62369966])

In [34]:
remainder, whole_part = np.modf(arr)
remainder

array([ 0.51459671, -0.10791367, -0.7909463 ,  0.24741966, -0.71800536,
       -0.40843795,  0.62369966])

In [35]:
whole_part

array([ 4., -8., -0.,  2., -6., -0.,  8.])

In [36]:
# Ufuncs accept an optional out argument that allows them to assign their results into
# an existing array rather than create a new one:

arr

array([ 4.51459671, -8.10791367, -0.7909463 ,  2.24741966, -6.71800536,
       -0.40843795,  8.62369966])

In [37]:
out = np.zeros_like(arr)

np.add(arr,1)

array([ 5.51459671, -7.10791367,  0.2090537 ,  3.24741966, -5.71800536,
        0.59156205,  9.62369966])

In [38]:
np.add(arr, 1, out=out)

array([ 5.51459671, -7.10791367,  0.2090537 ,  3.24741966, -5.71800536,
        0.59156205,  9.62369966])

In [39]:
out

array([ 5.51459671, -7.10791367,  0.2090537 ,  3.24741966, -5.71800536,
        0.59156205,  9.62369966])

# 4.4 Array-Oriented Programming with Arrays

Using NumPy arrays enables you to express many kinds of data processing tasks as
concise array expressions that might otherwise require writing loops. This practice
of replacing explicit loops with array expressions is referred to by some people
as vectorization. In general, vectorized array operations will usually be significantly
faster than their pure Python equivalents, with the biggest impact in any kind of
numerical computations.

In [40]:
# Suppose we wished to evaluate the function sqrt(x^2 + y^2) across a regular grid of values.
# The numpy.meshgrid() function takes two one-dimensional arrays and produces two two-dimensional matrices
# corresponding to all pairs of (x, y) in the two arrays

points = np.arange(-5, 5, 0.01) # 1000 equally spaced points (-5.00, -4.99, ..., 4.99)

xs, ys = np.meshgrid(points, points)

In [41]:
ys

array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
       [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
       [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
       ...,
       [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
       [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
       [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])

In [42]:
# Now, evaluating the function is a matter of writing the same expression you would write with two points:

z = np.sqrt(xs ** 2 + ys ** 2)

z

array([[7.07106781, 7.06400028, 7.05693985, ..., 7.04988652, 7.05693985,
        7.06400028],
       [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
        7.05692568],
       [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
        7.04985815],
       ...,
       [7.04988652, 7.04279774, 7.03571603, ..., 7.0286414 , 7.03571603,
        7.04279774],
       [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
        7.04985815],
       [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
        7.05692568]])

### Expressing Conditional Logic as Array Operations

In [43]:
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])

yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])

cond = np.array([True, False, True, True, False])

In [44]:
# Suppose we wanted to take a value from xarr whenever the corresponding value in
# cond is True, and otherwise take the value from yarr. A list comprehension doing
# this might look like:

result = [(x if c else y)
          for x, y, c in zip(xarr, yarr, cond)]

result

[1.1, 2.2, 1.3, 1.4, 2.5]

In [46]:
# This has multiple problems. First, it will not be very fast for large arrays (because all
# the work is being done in interpreted Python code). Second, it will not work with multidimensional arrays.
# With numpy.where() you can do this with a single function call:

result = np.where(cond, xarr, yarr)
result

array([1.1, 2.2, 1.3, 1.4, 2.5])

In [47]:
# The second and third arguments to numpy.where() don’t need to be arrays; one or
# both of them can be scalars. A typical use of where in data analysis is to produce a
# new array of values based on another array. Suppose you had a matrix of randomly
# generated data and you wanted to replace all positive values with 2 and all negative
# values with –2. This is possible to do with numpy.where():

arr = rng.standard_normal((4,4))
arr

array([[ 2.61815943,  0.77736134,  0.8286332 , -0.95898831],
       [-1.20938829, -1.41229201,  0.54154683,  0.7519394 ],
       [-0.65876032, -1.22867499,  0.25755777,  0.31290292],
       [-0.13081169,  1.26998312, -0.09296246, -0.06615089]])

In [48]:
arr > 0

array([[ True,  True,  True, False],
       [False, False,  True,  True],
       [False, False,  True,  True],
       [False,  True, False, False]])

In [49]:
np. where(arr > 0, 2, -2)

array([[ 2,  2,  2, -2],
       [-2, -2,  2,  2],
       [-2, -2,  2,  2],
       [-2,  2, -2, -2]])

In [50]:
# You can combine scalars and arrays when using numpy.where(). For example, I can
# replace all positive values in arr with the constant 2, like so:

np.where(arr > 0, 2, arr) # set only positive values to 2

array([[ 2.        ,  2.        ,  2.        , -0.95898831],
       [-1.20938829, -1.41229201,  2.        ,  2.        ],
       [-0.65876032, -1.22867499,  2.        ,  2.        ],
       [-0.13081169,  2.        , -0.09296246, -0.06615089]])

### Mathematical and Statistical Methods

A set of mathematical functions that compute statistics about an entire array or
about the data along an axis are accessible as methods of the array class. You can
use aggregations (sometimes called reductions) like sum, mean, and std (standard
deviation) either by calling the array instance method or using the top-level NumPy
function. When you use the NumPy function, like numpy.sum(), you have to pass the
array you want to aggregate as the first argument.


In [51]:
arr = rng.standard_normal((5,4))
arr

array([[-1.10821447,  0.13595685,  1.34707776,  0.06114402],
       [ 0.0709146 ,  0.43365454,  0.27748366,  0.53025239],
       [ 0.53672097,  0.61835001, -0.79501746,  0.30003095],
       [-1.60270159,  0.26679883, -1.26162378, -0.07127081],
       [ 0.47404973, -0.41485376,  0.0977165 , -1.64041784]])

In [52]:
arr.mean()

-0.08719744457434529

In [53]:
np.mean(arr)

-0.08719744457434529

In [54]:
arr.sum()

-1.743948891486906

In [55]:
# Functions like mean and sum take an optional axis argument that computes the
# statistic over the given axis, resulting in an array with one less dimension:

arr.mean(axis=1)

array([ 0.10899104,  0.3280763 ,  0.16502112, -0.66719934, -0.37087634])

In [56]:
arr.sum(axis=0)

array([-1.62923076,  1.03990647, -0.33436331, -0.82026129])

In [None]:
# Here, arr.mean(axis=1) means “compute mean across the columns,” where
# arr.sum(axis=0) means “compute sum down the rows.”

In [57]:
# Other methods like cumsum and cumprod do not aggregate, instead producing an array
# of the intermediate results:

arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])

arr.cumsum()

array([ 0,  1,  3,  6, 10, 15, 21, 28], dtype=int32)

In [58]:
# In multidimensional arrays, accumulation functions like cumsum return an array of
# the same size but with the partial aggregates computed along the indicated axis
# according to each lower dimensional slice:

arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [59]:
# The expression arr.cumsum(axis=0) computes the cumulative sum along the rows,
# while arr.cumsum(axis=1) computes the sums along the columns:

arr.cumsum(axis=0)

array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]], dtype=int32)

In [61]:
arr.cumsum(axis=1)

array([[ 0,  1,  3],
       [ 3,  7, 12],
       [ 6, 13, 21]], dtype=int32)

### Methods for Boolean Arrays

In [62]:
# Boolean values are coerced to 1 (True) and 0 (False) in the preceding methods. Thus,
# sum is often used as a means of counting True values in a Boolean array:

arr = rng.standard_normal(100)

(arr > 0).sum() #number of positive values

48

In [63]:
(arr <= 0).sum() #number of non-positive values

52

In [64]:
# The parentheses here in the expression (arr > 0).sum() are necessary to be able to
# call sum() on the temporary result of arr > 0.

# Two additional methods, any and all, are useful especially for Boolean arrays. any
# tests whether one or more values in an array is True, while all checks if every value is True:

bools = np.array([False, False, True, False])

bools.any()

True

In [65]:
bools.all()

False

In [None]:
# These methods also work with non-Boolean arrays, where nonzero elements are treated as True.

### Sorting

In [66]:
# Like Python’s built-in list type, NumPy arrays can be sorted in place with the sort method:

arr = rng.standard_normal(6)
arr

array([ 0.07726066, -0.68391322, -0.72083767,  1.12062282, -0.05481416,
       -0.08241372])

In [68]:
arr.sort()
arr

array([-0.72083767, -0.68391322, -0.08241372, -0.05481416,  0.07726066,
        1.12062282])

In [69]:
# You can sort each one-dimensional section of values in a multidimensional array in
# place along an axis by passing the axis number to sort. In this example data:

arr = rng.standard_normal((5, 3))
arr

array([[ 0.9359865 ,  1.23853712,  1.27279553],
       [ 0.40589222, -0.05032522,  0.28931754],
       [ 0.17930568,  1.39748056,  0.29204679],
       [ 0.63840567, -0.02788771,  1.37105185],
       [-2.05280763,  0.38050908,  0.75539067]])

In [70]:
# arr.sort(axis=0) sorts the values within each column, while arr.sort(axis=1) sorts across each row:

arr.sort(axis=0)
arr

array([[-2.05280763, -0.05032522,  0.28931754],
       [ 0.17930568, -0.02788771,  0.29204679],
       [ 0.40589222,  0.38050908,  0.75539067],
       [ 0.63840567,  1.23853712,  1.27279553],
       [ 0.9359865 ,  1.39748056,  1.37105185]])

In [71]:
arr.sort(axis=1)
arr

array([[-2.05280763, -0.05032522,  0.28931754],
       [-0.02788771,  0.17930568,  0.29204679],
       [ 0.38050908,  0.40589222,  0.75539067],
       [ 0.63840567,  1.23853712,  1.27279553],
       [ 0.9359865 ,  1.37105185,  1.39748056]])

In [72]:
# The top-level function numpy.sort returns a sorted copy of an array (like the Python
# built-in function sorted) instead of modifying the array in place. For example:

arr2 = np.array([5, -10, 7, 1, 0, -3])

sorted_arr2 = np.sort(arr2)

sorted_arr2

array([-10,  -3,   0,   1,   5,   7])

### Unique and Other Set Logic

In [73]:
# NumPy has some basic set operations for one-dimensional ndarrays. A commonly
# used one is numpy.unique, which returns the sorted unique values in an array:

names = np.array(["Bob", "Will", "Joe", "Bob", "Will", "Joe", "Joe"])

np.unique(names)

array(['Bob', 'Joe', 'Will'], dtype='<U4')

In [74]:
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])

np.unique(ints)

array([1, 2, 3, 4])

In [75]:
# Contrast numpy.unique with the pure Python alternative:

sorted(set(names))

['Bob', 'Joe', 'Will']

In [76]:
# Another function, numpy.isin, tests membership of the values in one array in
# another, returning a Boolean array (numpy.in1d, its older 1-D-only spelling,
# is deprecated since NumPy 2.0):

values = np.array([6, 0, 0, 3, 2, 5, 6])

np.isin(values, [2, 3, 6])

array([ True, False, False,  True,  True, False,  True])