In [2]:
import numpy as np
# https://numpy.org/doc/1.23/user/quickstart.html

In [9]:
# advanced numpy lecture 36:59
a = np.array([[100, 200, 50, 400], [50, 0, 0, 100], [350, 100, 50, 200]])
abytes = a.ravel().view(dtype=np.uint8) # uint8 is unsigned 8-bit integer 0-255, int8 is signed 8-bit integer -128 to 127


(12,)

In [3]:
# Initialization -------------------------------------------
# Dimensions
a0 = np.array(1)    # 1     0 dimension
a1 = np.array([1, 2, 3])  # [1 2 3]     1 dimensional array
a2 = np.array([[1, 2, 3], [4, 5, 6]])  # 2 dimensional array
a3 = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])  # 3 dimensional array
ax = np.array([1, 2], ndmin=5)    # [[[[[1 2]]]]]      force 5 dimensional array
a1.ndim # 1
a2.ndim # 2
a3.ndim # 3
combo = np.array(np.arrange(1, 5), np.arange(5, 9)) # [[1 2 3 4] [5 6 7 8]]
# Autogenerate
zeroes = np.zeros((2, 2))  # Create an 2x2 array filled with zeroes (float, so 0.)
ones = np.ones((2, 2))  # Create an 2x2 array filled with ones (float, so 1.)
empty = np.empty((2, 2))  # Create an 2x2 array filled with random values - faster than zeros and ones
full = np.full((2, 2), 7)  # Create an 2x2 array filled with 7
full = np.full(a2.shape, 7)  # Create an array filled with 7, same shape as a2
identiy = np.eye(2)  # Create a 2x2 identity matrix
range_array = np.arange(0, 10, 2)  # [0 2 4 6 8], syntax same as range(): start, stop, step
linsspace = np.linspace(0, 10, 5)  # [ 0.0  2.5  5.0  7.5  10.0 ]
logspace = np.logspace(0, 1, 5, base=10) # [ 1.  1.77827941  3.16227766  5.62341325  10. ]

# Initialize large arrays
# create array first in full size, then fill in w/ assignment
epochs = range(100_000)
result_array = np.zeros((len(epochs),))  # zeroes() accept a tuple param, singleton tupe is syntax (num,)
for index, e in enumerate(epochs):
    value = e * 2   # do something
    result_array[index] = value
# even if you don't know the size beforehand, you can use resize()

In [None]:
# Memory saving / use Disk -------------------------------------------
# results = np.ones((600,600,600,6))        <= don't do this, use Disk for storage
# results[2,4,5,1] = 100

import h5py
hdf5_store = h5py.File("./cache.hdf5", mode="a")     # https://docs.h5py.org/en/stable/high/file.html 
results = hdf5_store.create_dataset("results", (600,600,600,6), dtype=np.float32, compression="gzip")
# do something
results[2,3,4,1] = 100

In [4]:
# Datatypes ----------------------------
# let numpy choose the datatype
x = np.array([1, 2])
x.dtype     # dtype('int32')
x = np.array([1.0, 2.0])
x.dtype     # dtype('float64')

# default type is float64
bar1 = np.ones(2)  # Create an array of ones of type int64
bar1.dtype    # dtype('float64')

# force a particular datatype
bar2 = np.array([1.0, 2.0], dtype=np.int8)    # 1 bytes = 8 bits
bar2.dtype    # dtype('int8')

dtype('int64')

In [5]:
# Dimensions, Size, Shape -------------------------------------------
# Shape: https://www.youtube.com/watch?v=ZVNbUuitv54&list=PLmzborXQMQuocnQ8TA9q5WTd6yFhNbIaV&index=3
# ndim, # of axis or dimension
# size, total # of elements
# shape: # of elements in tupe: dimension, 1st dimension contains 4 elements, 2nd dimension contains 2 elements, last dimension contains 3 elements

a = np.array([[[0, 1, 2, 3], [4, 5, 6, 7]], [[0, 1, 2, 3], [4, 5, 6, 7]], [[0, 1, 2, 3], [4, 5, 6, 7]]])
a.ndim      # 3
a.size      # 24
a.shape     # (3, 2, 4)  <= 4 elements in brackets, 2 yellows, 3 blues

(3, 2, 4)

In [20]:
# Reshaping / newaxis ----------------------------
a = np.arange(1, 7)     # [1 2 3 4 5 6]
b = a.reshape(2, 3)     # [[1 2 3] [4 5 6]]
c = a.reshape(3, 2)     # [[1 2] [3 4] [5 6]]

# newaxis is used to create a new axis in the data - add a blank axis to increase the dimension of the array
# needed when model require the data to be shaped in a certain manner
a.shape     # (6,)      <= it's a singleton tuple, a tuple with 1 element (6,), not 6 and something else. 
d = a[np.newaxis, :]    # [[1 2 3 4 5 6]]   <= add a new axis at the beginning
d.shape     # (1, 6)
e = a[:, np.newaxis]    # [[1] [2] [3] [4] [5] [6]]   <= add a new axis at the end
e.shape     # (6, 1)

# Flatten to 1D
# difference is that flatten() returns a copy of the array, while ravel() / reshape() returns a view / reference of the array
c.reshape(-1)   # [1 2 3 4 5 6]

k = np.array([[[[[1, 2, 3], [4, 5, 6]]], [[[1, 2, 3], [4, 5, 6]]]]])
k.shape     # (1, 2, 1, 2, 3)
k = k[:, np.newaxis]
k.shape     # (1, 1, 2, 1, 2, 3)

x = np.arange(10)
x.shape     # (10,)
x[:, np.newaxis, np.newaxis].shape   # (10, 1, 1)
x[np.newaxis, np.newaxis,:].shape   # (1, 1, 10)

(1, 1, 2, 1, 2, 3)

In [7]:
# Indexing --------------------------------
two_dim = np.array([[1, 2, 3], [4, 5, 6]]) 
two_dim[0, 0]   # 1
three_dim = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
three_dim[1, 1, 2]  # 12
# Slice
arr = np.arange(0,9) # [0 1 2 3 4 5 6 7 8]
arr[1:7] # [1 2 3 4 5 6]
arr[0:7] # [0 1 2 3 4 5 6]
# Slice (2)
two_dim[1, 1:]  # [5 6]     <= [1:] means slice to the end
two_dim[:, 1:3] # [[2 3] [5 6]]     <= [:, 1:3] means slice all rows, columns 1 and 2
three_dim[:, 1, :]  # [[4 5 6] [10 11 12]]     <= [:, 1, :] means all rows, row 1, all columns
# Devil in the details, potential BUG!
A = np.array([[n+m*10 for n in range(4)] for m in range(4)]) # [[0 1 2 3] [10 11 12 13] [20 21 22 23] [30 31 32 33]]
A[3:,:]     # [[30 31 32 33]]     <= 2 dimensional
A[3,:]      # [30 31 32 33]       <= 1 dimensional 


array([30, 31, 32, 33])

In [8]:
# Boolean indexing ----------------------------
a = np.array([[1, 2], [3, 4], [5, 6]])
# [[1 2]
#  [3 4]
#  [5 6]]

# same shape with True or False for the condition
bool_idx = (a > 2)
# [[False False]
#  [ True  True]
#  [ True  True]]

# note: this will be a rank 1 array
a[bool_idx]         #[3 4 5 6]
a[a > 2]            #[3 4 5 6]      # same as above, more concise

# modify the array
b = np.where(a > 2, a, -1)  # if a > 2, then a, else -1  <= ternary operator
# [[-1 -1]
#  [ 3  4]
#  [ 5  6]]

In [9]:
# Fancy indexing / Advanced indexing ----------------------------
# https://www.pythonlikeyoumeanit.com/Module3_IntroducingNumpy/AdvancedIndexing.html
# fancy indexing: pass a list of indices to access multiple array elements at once
a = np.array([10, 19, 30, 41, 50, 61])
b = a[[1, 3, 5]]  # [19 41 61]

# compute indices where condition is true
even = np.argwhere(a%2==0).flatten()  # [0 2 4]     <= these are indices
a_even = a[even]                      # [10 30 50]  <= pass the indices as param in fancy indexing to get the values

# arrays of indices passed as param are Parallel processed, like zip list
A = np.array([[n+m*10 for n in range(4)] for m in range(4)])    # [[0 1 2 3] [10 11 12 13] [20 21 22 23] [30 31 32 33]]
A[[1, 2, 0], [1, 3, -1]]   # [11 23 3]     <= [1, 1], [2, 3], [0, 3]     <= [row, column]  

array([11, 23,  3])

In [10]:
# nonzero, where, masking --------------------------------
# https://numpy.org/doc/stable/reference/generated/numpy.nonzero.html

# nonzero: returns a tuple of arrays, paraelle processing the indices of nonzero elements, like zip list
x = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]])
x.nonzero()    # (array([0, 1, 2, 2]), array([0, 1, 0, 1]))  <= (row, column), so index (0,0), (1,1), (2,0), (2,1)
x[[0, 1, 2, 2], [0, 1, 0, 1]]   # [3 4 5 6]
np.transpose(x.nonzero())
# ([[0, 0],
#  [1, 1],
#  [2, 0],
#  [2, 1]])

# where
from numpy import where
a = np.array([10, 20, 30, 40, 50])
b = np.array([80, 40, 20, 10, 90])
c = a>b     # [False  True  True  True False]
c = where(a>b, a, b)    # [80 20 30 40 90]      <= ternary operator

# where (2)
x = np.arange(1, 7, 0.5)    # [1.  1.5 2.  2.5 3.  3.5 4.  4.5 5.  5.5 6. ]     <= 11 numbers   
mask = (3 < x) & (x < 6.5)  # [False False False False  True  True  True  True  True  True False]
# and = &, or = |, not = ~
indices = np.where(mask)    # (array([4, 5, 6, 7, 8, 9]),)     <= indices of True
x[indices]                 # [3.  3.5 4.  4.5 5.  5.5]       <= values of True

# Masking (1)
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
a > 3    # [[False False False] [ True  True  True] [ True  True  True]]
np.nonzero(a > 3)   # (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2]))  <= (row, column), so 4 = (1, 1) index
(a > 3).nonzero()   # (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2]))  <= (row, column)
a[a > 3]    # [4 5 6 7 8 9]
a[a < 4] = 0   # [[0 0 0] [4 5 6] [7 8 9]]

# Masking (2)
B = np.array([n for n in range(5)])   # [0 1 2 3 4]
B[np.array([True, False, True, False, False])]  # [0 2]
B[np.array([1,0,1,0,0], dtype=bool)]    # [0, 2]


array([0, 2])

In [11]:
# Stacking, concatenating, repeat, tile ----------------------------
# create large vectors and matrices from smaller ones
# https://numpy.org/doc/stable/reference/routines.array-manipulation.html
# Concatenation
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])
# axis=0: column, axis=1: rows, axis=2: depth(z axis), axis=None: flatten
# https://stackoverflow.com/questions/17079279/how-is-axis-indexed-in-numpys-array
c = np.concatenate((a, b), axis=None)  # [1 2 3 4 5 6]   <= flatten the array
d = np.concatenate((a, b), axis=0)     # [[1 2] [3 4] [5 6]]   <= concatenate along axis 0
e = np.concatenate((a, b.T), axis=1)   # [[1 2 5] [3 4 6]]   <= concatenate along axis 1

# hstack: stack arrays in sequence horizontally (column wise)
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
c = np.hstack((a, b))  # [1 2 3 4 5 6 7 8]

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
c = np.hstack((a, b))  # [[1 2 5 6] [3 4 7 8]]

# vstack: stack arrays in sequence vertically (row wise)
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
c = np.vstack((a, b, a)) # [[1 2 3 4] [5 6 7 8] [1 2 3 4]]

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
c = np.vstack((a, b))  # [[1 2] [3 4] [5 6] [7 8]]

# repeat
a = np.array([[1, 2], [3, 4]])
np.repeat(a, 3)     # [1 1 1 2 2 2 3 3 3 4 4 4]     <= repeat each element 3 times
np.repeat(a, 3, axis=0)  # [[1 2] [1 2] [1 2] [3 4] [3 4] [3 4]]
np.repeat(a, 3, axis=1) # [[1 1 1 2 2 2] [3 3 3 4 4 4]]

# tile
np.tile(a, 3)       # [[1 2 1 2 1 2] [3 4 3 4 3 4]]   <= tile the array 3 times

In [12]:
# Broadcasting ----------------------------
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
b = np.array([1, 0, 1])
c = a + b  # [[ 2  2  4] [ 5  5  7] [ 8  8 10] [11 11 13]]  <= add value to each row of the array using broadcasting

In [13]:
# Statistical methods ----------------------------
# sum() - sum of all elements
a = np.array([[7, 8, 9, 10, 11, 12, 13], [17, 18, 19, 20, 21, 22, 23]])
a.sum()     # 210   <= sum all the elements, default axis is None
a.sum(axis=None)  # 210
a.sum(axis=0)   # [24 26 28 30 32 34 36]   <= sum along axis 0
a.sum(axis=1)   # [70 98]   <= sum along axis 1

# mean() - mean of all elements
a.mean()    # 15.0  <= mean of all the elements, default axis is None
a.mean(axis=None)  # 15.0
a.mean(axis=0)  # [12. 13. 14. 15. 16. 17. 18.]   <= mean along axis 0
a.mean(axis=1)  # [10. 14.]   <= mean along axis 1

# min(), max()
a = np.arange(1, 10).reshape(3, 3)  # [[1 2 3] [4 5 6] [7 8 9]]
a.min()     # 1  <= min of flattened array
a.min(0)    # [1 2 3]   <= min along axis 0 (columns)
a.min(1)    # [1 4 7]   <= min along axis 1 (rows)

# more functions: std, var, min, max, argmin, argmax, median, percentile, sort, etc
# sort  https://numpy.org/doc/1.23/reference/routines.sort.html

array([1, 4, 7])

In [14]:
# Copying , Referencing (View) ----------------------------
# Referencing (1)
a = np.array([1, 2, 3])
b = a   # b is a reference to a, only copies reference
b[0] = 5
a   # [5 2 3]
b   # [5 2 3]

# Referencing (2)
arr = np.arange(10)     # [0 1 2 3 4 5 6 7 8 9]
arr_part = arr[5:8]     # [5 6 7]
arr_part[:] = 666
arr                     # [  0   1   2   3   4 666 666 666   8   9]
arr[:] = 999
arr_part                # [999 999 999]

# Copying
a = np.array([1, 2, 3])
b = a.copy()    # b is a copy of a, copies the data
b[0] = 5
a   # [1 2 3]
b   # [5 2 3]

array([5, 2, 3])

In [24]:
# Random numbers ----------------------------
# https://numpy.org/doc/1.16/reference/routines.random.html

# param is a tuple, random numbers btwn 0 and 1, uniform distri,
a = np.random.random((3,2)) # [[0.123 0.456] [0.789 0.123] [0.456 0.789]]

# param is comma separated list
# Note: the numbers are not in a tuple, but are comma separated
a = np.random.rand(3,2)     # [[0.123 0.456] [0.789 0.123] [0.456 0.789]]
b = np.random.randn(3,2)     # [[-0.123 0.456] [0.789 -0.123] [-0.456 0.789]]  <= normal distribution, mean 0, std 1
e = np.random.randint(3, 10, size=(3,3))    # [[9 3 9] [8 4 6] [5 9 7]]  <= random integers btwn 3 and 10, uniform distribution mean 6.5, std 2.5
f = np.random.choice(7, size=10)    # [1 2 3 4 5 6 7 1 2 3]  <= random integers btwn 1 and 7, uniform distribution
g = np.random.choice([2, 3, 5, 7], size=8)  # [2 3 5 7 2 3 5 7]  <= random integers from the list, uniform distribution

array([[3, 9, 3],
       [8, 6, 4],
       [5, 9, 7]])

In [None]:
# Conditional ----------------------------
# https://numpy.org/doc/1.16/reference/routines.logic.html
M = np.array([[1, 2], [3, 4]])   # if statements, one needs to use any() or all(), which requires any or all elements in array eval to True
if (M > 5).any():
    print("at least one element in M is greater than 5")
else:
    print("No element in M is greater than 5")

if (M > 5).all():
    print("all elements in M are greater than 5")
else:
    print("all elements in M are not greater than 5")


In [6]:
# Iteration ----------------------------
a = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])   # 3D array, shape (2, 2, 2)
for x in np.nditer(a):
    print(x, end=" ")   # 1 2 3 4 5 6 7 8 9 10 11 12

for idx, x in np.ndenumerate(a):    # ndenumerate() returns a tuple of index and value
    print(idx, x, end=" ")   # (0, 0, 0) 1  (0, 0, 1) 2  (0, 0, 2) 3  (0, 1, 0) 4  (0, 1, 1) 5  (0, 1, 2) 6  (1, 0, 0) 7  (1, 0, 1) 8  (1, 0, 2) 9  (1, 1, 0) 10  (1, 1, 1) 11  (1, 1, 2) 12

1 2 3 4 5 6 7 8 9 10 11 12 (0, 0, 0) 1 (0, 0, 1) 2 (0, 0, 2) 3 (0, 1, 0) 4 (0, 1, 1) 5 (0, 1, 2) 6 (1, 0, 0) 7 (1, 0, 1) 8 (1, 0, 2) 9 (1, 1, 0) 10 (1, 1, 1) 11 (1, 1, 2) 12 

In [16]:
# Linear algebra ----------------------------
# Eigenvalues and eigenvectors
a = np.array([[1,2], [3,4]])
eigenvalues, eigenvectors = np.linalg.eig(a)    # eigenvalues: [5. -2.], eigenvectors: [[-0.707 -0.447] [0.707 -0.894]]

# Solve linear systems (Ax = b)
A = np.array([[1, 1], [1.5, 4]])
b = np.array([2200, 5050])
# Non-example: calc inverse is slow and less accurate, don't use it
x = np.linalg.inv(A).dot(b) # [1500.  700.]
# Instead use:
x = np.linalg.solve(A, b)   # [1500.  700.]

In [3]:
# Vectorize ----------------------------
# make function to handle a vector input
def Theta(x):
    if x >= 0:
        return 1
    else:
        return 0
# Theta(np.array([-3,-2,-1,0,1,2,3]))  # TypeError: because Theta function can't handle a vector input
Theta_vec = np.vectorize(Theta)     # vectorize the function
Theta_vec(np.array([-3, -2, -1, 0, 1, 2, 3]))  # [0 0 0 1 1 1 1]

# Implement function to accept a vector input from the beginning (more effort, but better performance)
def ThetaV2(x):
    return 1 * (x >= 0)
ThetaV2(np.array([-3, -2, -1, 0, 1, 2, 3])) # [0 0 0 1 1 1 1]
ThetaV2(-1.2)   # 0     <= still works for scalar input too
ThetaV2(2.6)   # 1

In [6]:
# File IO ----------------------------
# Loading data from CSV
# np.loadtxt(), skiprows=1
data = np.loadtxt('my_file.csv', delimiter=',', skiprows=1, dtype=np.float32)

# np.genfromtxt(), similar but slightly more configuration params
# skip_header=0, missing_values="---", filling_values=0.0, ...
data = np.genfromtxt('my_file.csv', delimiter=',', skip_header=1, dtype=np.float32)

#Write data to CSV
M = np.random.rand(3,3)     # [[0.123 0.456 0.789] [0.123 0.456 0.789] [0.123 0.456 0.789]]
np.savetxt("random-matrix.csv", M, fmt='%.5f', delimiter=",")     # fmt specifies the format of the data, delimiter specifies the separator


FileNotFoundError: my_file.csv not found.