# NumPy Basics: Arrays and Vectorized Computation

Here are some things you'll find in NumPy:
- `ndarray`, an efficient multidimensional array providing fast array-oriented arithmetic operations and flexible broadcasting capabilities.
- Mathematical functions for fast operations on entire arrays of data without having to write loops.
- Tools for reading/writing array data to disk and working with memory-mapped files.
- Linear algebra, random number generation, and Fourier transform capabilities.

NumPy is important for numerical computation in Python because it is designed for efficiency on large arrays of data. It is also the foundation for many other packages in the scientific Python ecosystem, such as pandas, scikit-learn, and matplotlib.

In [None]:
# import numpy
import numpy as np

In [None]:
# Build a one-million-element array and time doubling every element with a
# single vectorized expression (%timeit is an IPython magic).
my_arr = np.arange(1_000_000)
%timeit my_arr2 = my_arr * 2

In [None]:
# Same workload in pure Python: a list of one million ints doubled with a
# list comprehension, timed for comparison with the array version.
my_list = list(range(1_000_000))
%timeit my_list2 = [x * 2 for x in my_list]

NumPy algorithms are generally 10 to 100 times faster than their pure Python counterparts and use significantly less memory.

An ndarray is a generic multidimensional container for homogeneous data; that is, all of the elements must be the same type. Every array has a shape, a tuple indicating the size of each dimension, and a dtype, an object describing the data type of the array.

In [None]:
# Build a two-dimensional array (2 rows, 3 columns) from a nested list.
data = np.array([[1.5, -0.1, 3], [0, -3, 6.5]])
data

In [None]:
# Arithmetic with a scalar is applied to every element.
data * 10

In [None]:
# Adding two same-shape arrays adds the corresponding elements.
data + data

In [None]:
# Tuple giving the size of each dimension.
data.shape

In [None]:
# Object describing the data type of the elements.
data.dtype

In [None]:
# produce a one-dimensional array of ten 0s.
np.zeros(10)

In [None]:
# pass a tuple as the shape to get a higher-dimensional array (here 4 x 2).
np.zeros((4, 2))

In [None]:
# creates an array without initializing its values; the contents are
# arbitrary, so do not rely on them being zero.
np.empty(10)

In [None]:
# creates a 3 x 6 array of 1s.
np.ones((3, 6))

In [None]:
# create a square N x N identity matrix (here N = 4).
np.eye(4)

In [None]:
# np.identity also builds a square identity matrix (here 2 x 2).
np.identity(2)

In [None]:
# array-valued counterpart of the built-in range: the integers 0 through 14.
np.arange(15)

In [None]:
# Parse numbers stored as byte strings by casting the array to float.
# np.bytes_ is used because the np.string_ alias was removed in NumPy 2.0.
numeric_strings = np.array(["1.25", "-9.6", "42"], dtype=np.bytes_)
numeric_strings.astype(float)

Arrays are important because they enable you to express batch operations on data without writing any for loops. This is called vectorization. Any arithmetic operation between equal-size arrays is applied element-wise.

In [None]:
# 2 x 3 float array used in the arithmetic and indexing examples below.
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
arr

In [None]:
# a scalar on the left-hand side is combined with every element as well.
1 / arr

In [None]:
# element-wise power.
arr**2

In [None]:
# comparing two same-shape arrays yields a Boolean array of the same shape.
other_ar = np.array([[0.0, 4.0, 1.0], [7.0, 2.0, 12.0]])
arr > other_ar

In [None]:
# a single integer index selects a whole row (here the second row).
arr[1]

In [None]:
# a comma-separated pair of indices selects one element: row 0, column 2.
arr[0, 2]

In [None]:
# seven names; the data array in the next cell has one row per name.
names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])
names

In [None]:
# 7 x 2 integer array; row i is paired with names[i] in the examples below.
data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2], [-12, -4], [3, 4]])
data

In [None]:
# comparison is also vectorized: it yields one Boolean per name.
names == "Bob"

In [None]:
# a Boolean array used as an index keeps the rows where it is True.
data[names == "Bob"]

In [None]:
# ~ inverts a Boolean array, selecting every row whose name is not "Bob".
cond = names == "Bob"
data[~cond]

In [None]:
# combine conditions with | (or); the parentheses are required because |
# binds more tightly than ==.
mask = (names == "Bob") | (names == "Will")
mask

In [None]:
data[mask]

In [None]:
# assignment through a Boolean mask modifies data in place: every negative
# value is replaced with 0.
data[data < 0] = 0
data

In [None]:
# 8 x 4 array of zeros, filled in by the loop in the next cell.
arr = np.zeros((8, 4))
arr

In [None]:
# assigning a scalar to a row sets every element of that row, so row i
# ends up holding the value i.
for i in range(8):
    arr[i] = i

arr

In [None]:
# to select a subset of the rows in a particular order you can simply pass a list or ndarray of integers specifying the desired order.
arr[[4, 3, 0, 6]]

In [None]:
# you can use negative indices to select rows from the end of the array.
# The indexing expression is kept as the last line of the cell so that its
# result is the value the notebook displays.
arr[[-3, -5, -7]]

In [None]:
# the integers 0 through 31 arranged as an 8 x 4 array.
arr = np.arange(32).reshape((8, 4))
arr

In [None]:
# passing multiple index arrays does something slightly different; it selects a one-dimensional array of elements corresponding to each tuple of indices.
# Here the elements at (1, 0), (5, 3), (7, 1) and (2, 2) are selected.
arr[[1, 5, 7, 2], [0, 3, 1, 2]]

In [None]:
# inner matrix product: arr.T (3 x 5) times arr (5 x 3) gives a 3 x 3 result
arr = np.array([[0, 1, 0], [1, 2, -2], [6, 3, 2], [-1, 0, -1], [1, 0, 1]])
np.dot(arr.T, arr)

In [None]:
# inner matrix product using the @ operator
arr.T @ arr

In [None]:
arr

In [None]:
# transpose with an explicit axis order; (1, 0) swaps the two axes
arr.transpose((1, 0))

The `numpy.random` module supplements Python's built-in `random` module with functions for efficiently generating whole arrays of sample values from many kinds of probability distributions.

In [None]:
# draw a 4 x 4 array of samples from a normal distribution, using the
# function's default parameters.
samples = np.random.normal(size=(4, 4))
samples

Python's built-in random module, by contrast, only samples one value at a time. As you can see, numpy.random is well-suited for performing a large number of random draws from a statistical distribution.

In [None]:
from random import normalvariate

N = 1_000_000

# time drawing N normal values one at a time in pure Python.
%timeit samples = [normalvariate(0, 1) for _ in range(N)]

In [None]:
# time drawing the same number of values with a single NumPy call.
%timeit np.random.normal(size=N)

In [None]:
# create a generator with an explicit seed so the draws below are reproducible.
rng = np.random.default_rng(seed=424216)
rng

In [None]:
# 2 x 3 array of samples drawn from the generator.
data = rng.standard_normal((2, 3))
data

In [None]:
# two length-8 sample arrays for the element-wise example below.
x = rng.standard_normal(8)
y = rng.standard_normal(8)

In [None]:
x

In [None]:
y

In [None]:
# element-wise maximum: for each position, the larger of x and y
np.maximum(x, y)

Using NumPy arrays makes it possible to express many kinds of data processing tasks as concise array expressions that can often be carried out without any for loops, leading to much cleaner and faster Python code.

In [None]:
# equally spaced points starting at -5, in steps of 0.01.
points = np.arange(-5, 5, 0.01)
# meshgrid turns the one-dimensional points into two two-dimensional
# coordinate arrays covering every (x, y) pair.
xs, ys = np.meshgrid(points, points)
ys

In [None]:
# evaluate sqrt(x^2 + y^2) over the whole grid with one array expression.
z = np.sqrt(xs**2 + ys**2)
z

In [None]:
# use matplotlib to plot this two-dimensional array
import matplotlib.pyplot as plt

plt.imshow(z, cmap=plt.cm.gray, extent=[-5, 5, -5, 5])
plt.colorbar()
# Raw string: "\s" is not a valid Python escape sequence and triggers a
# warning in a normal string literal; the text passed to matplotlib is unchanged.
plt.title(r"Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")

In [None]:
# two value arrays and a Boolean condition, all of length 5.
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])

cond = np.array([True, False, True, True, False])

In [None]:
# ternary expression: take the value from xarr where cond is True, otherwise from yarr (pure-Python version, produces a list)
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
result

In [None]:
# ternary expression using np.where. The second and third arguments to np.where don’t need to be arrays
result = np.where(cond, xarr, yarr)
result

In [None]:
# 5 x 4 array of samples drawn from the generator.
arr = rng.standard_normal((5, 4))
arr

In [None]:
# mean over all elements, called as a method ...
arr.mean()

In [None]:
# ... or as a top-level NumPy function.
np.mean(arr)

In [None]:
# sum over all elements.
arr.sum()

In [None]:
# running total: element i of the result is the sum of arr[0] through arr[i].
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])
arr.cumsum()

In [None]:
# summing a Boolean array counts its True values.
arr = rng.standard_normal(100)
(arr > 0).sum()  # Number of positive values

In [None]:
(arr <= 0).sum()  # Number of non-positive values

In [None]:
bools = np.array([False, False, True, False])

In [None]:
# True if at least one element is True.
bools.any()

In [None]:
# True only if every element is True.
bools.all()

In [None]:
# sorted unique values in the array.
names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])
np.unique(names)

In [None]:
# matrix product of a 2 x 3 array with a 3 x 2 array, giving a 2 x 2 result.
x = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
y = np.array([[6.0, 23.0], [-1, 7], [8, 9]])
x.dot(y)

In [None]:
# function form of the same product.
np.dot(x, y)

In [None]:
# NOTE(review): qr is imported but not used in the cells visible here.
from numpy.linalg import inv, qr

X = rng.standard_normal((5, 5))

In [None]:
# product of the transpose of X with X (a 5 x 5 matrix).
mat = X.T @ X
mat

In [None]:
# matrix inverse.
inv(mat)

In [None]:
# multiplying a matrix by its inverse should give the identity matrix, up to floating-point rounding error.
mat @ inv(mat)

In [None]:
import random

# Pure-Python random walk: start at 0 and move +1 or -1 on each of `steps`
# coin flips, recording the position after every move.
steps = 1000
position = 0
walk = [position]

for _ in range(steps):
    # randint(0, 1) returns 0 or 1; map 1 -> +1 and 0 -> -1.
    step = 2 * random.randint(0, 1) - 1
    position = position + step
    walk.append(position)

In [None]:
# plot the first 100 values of the walk.
plt.plot(walk[:100])

In [None]:
# Vectorized random walk: draw all the coin flips at once, turn them into
# +1/-1 steps and take the cumulative sum to get the position after each step.
# The step count is kept in nsteps so that steps only ever refers to the
# array of +1/-1 moves.
nsteps = 1000
rng = np.random.default_rng(seed=424216)
draws = rng.integers(0, 2, size=nsteps)  # each draw is 0 or 1
steps = np.where(draws == 0, 1, -1)
walk = steps.cumsum()

In [None]:
# largest and smallest positions reached along the walk.
walk.max()

In [None]:
walk.min()

In [None]:
# first crossing time: index of the first step at which the walk is at least 10 away from 0.
# argmax returns the position of the first True in the Boolean array; if no
# position ever reaches 10 it returns 0, so the result is only meaningful
# when the walk does get that far.
(np.abs(walk) >= 10).argmax()

In [None]:
# simulate many walks at once: one row per walk, one column per step.
nwalks = 5000
nsteps = 1000
draws = rng.integers(0, 2, size=(nwalks, nsteps))
steps = np.where(draws > 0, 1, -1)
# cumulative sum along each row gives every walk's position over time.
walks = steps.cumsum(axis=1)
walks

In [None]:
# maximum and minimum position over all walks.
walks.max()

In [None]:
walks.min()

In [None]:
# one Boolean per walk: did its absolute position ever reach at least 30?
hits30 = (np.abs(walks) >= 30).any(axis=1)
hits30

In [None]:
# number of walks whose absolute position reached at least 30.
hits30.sum()

In [None]:
# for the walks selected by hits30, the index of the first step at which
# the absolute position is at least 30.
crossing_times = (np.abs(walks[hits30]) >= 30).argmax(axis=1)
crossing_times

In [None]:
# average first crossing time over those walks.
crossing_times.mean()