In [1]:
import numpy as np

# Numpy Array and Properties

Unlike Python list, the elements of a numpy array have the same data type.  
Numpy arrays allow vectorised computation which is much faster than loops.  
For machine learning application, which require large amount of computations, vectorised computation is mandatory.

In [386]:
a = range(1000000)
b = np.arange(0, 1e7, 1, dtype = 'int32')

In [387]:
def list_div(inp, div):
    out = []
    for i in inp:
        out.append(i/div)
    return out

In [388]:
%timeit q = list_div(a, 2)
%timeit q = b/2

167 ms ± 16.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
27 ms ± 5.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Data types in numpy array

- bool_  
Boolean (True or False) stored as a byte  
- int_  
Default integer type (same as C long; normally either int64 or int32)  
- intc	  
Identical to C int (normally int32 or int64)  
- intp  
Integer used for indexing (same as C ssize_t; normally either int32 or int64)  
- int8  
Byte (-128 to 127)  
- int16  
Integer (-32768 to 32767)  
- int32  
Integer (-2147483648 to 2147483647)  
- int64  
Integer (-9223372036854775808 to 9223372036854775807)  
- uint8  
Unsigned integer (0 to 255)  
- uint16  
Unsigned integer (0 to 65535)  
- uint32  
Unsigned integer (0 to 4294967295)  
- uint64  
Unsigned integer (0 to 18446744073709551615)  
- float_  
Shorthand for float64  
- float16  
Half precision float: sign bit, 5 bits exponent, 10 bits mantissa  
- float32  
Single precision float: sign bit, 8 bits exponent, 23 bits mantissa  
- float64  
Double precision float: sign bit, 11 bits exponent, 52 bits mantissa  
- complex_  
Shorthand for complex128.  
- complex64  
Complex number, represented by two 32-bit floats  
- complex128  
Complex number, represented by two 64-bit floats  


##### Creating an array

In [126]:
# creating an array
a = np.array([[1,2],
              [3,4]], dtype = 'int')
print(a.shape)
print(a)

(2, 2)
[[1 2]
 [3 4]]


In [30]:
# create array of 1s with designated shape
a = np.ones((2,2), dtype = 'int')
print(a.shape)
print(a)

(2, 2)
[[1 1]
 [1 1]]


In [27]:
# create array of 0s with designated shape
a = np.zeros((3,3), dtype = 'float32')
print(a.shape)
print(a)

(3, 3)
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [35]:
# array full of an element
a = np.full(shape = (3,4),
            fill_value = 2)
print(a)

[[2 2 2 2]
 [2 2 2 2]
 [2 2 2 2]]


In [40]:
# identity matrix
a = np.eye(5)
print(a)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [114]:
# Create an array filled with a linear sequence
# Starting at 0, ending at 20, stepping by 2
# (this is similar to the built-in range() function)
a = np.arange(0, 20, 2)
print(a)

[ 0  2  4  6  8 10 12 14 16 18]


In [118]:
# Create an array filled with a linear sequence
# Create an array of five 21 evenly spaced between 0 and 21
a = np.linspace(0, 20, 21)
print(a)

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20.]


##### Create with random choices

In [45]:
# matrix with random integers
a = np.random.randint(low = 3, high = 10, size = (2,4))
print(a)

array([[9, 8, 9, 3],
       [7, 7, 9, 8]])

In [54]:
# random float between 0 and 1
a = np.random.rand(3, 4)
print(a)

[[0.79149041 0.63641728 0.75502792 0.74833868]
 [0.13297529 0.33191999 0.29693325 0.12263582]
 [0.56730759 0.70898625 0.23641769 0.94014293]]


In [58]:
# randomly pick elements from standard normal distribution (mean = 0, variance = 1)
a = np.random.randn(2,2)
print(a)

[[-0.20623117 -0.05000452]
 [ 0.04885306 -1.5924071 ]]


In [121]:
# randomly pick from a normal distribution with given mean and standard deviation
a = np.random.normal(loc = 3, scale = 2, size = (2,3))
print(a)

[[3.08432967 3.34525535 5.47638711]
 [1.17056208 1.73567436 3.20125153]]


##### Array properties

In [60]:
a = np.random.randn(3, 5)
print(a)

[[-0.94750327 -0.14570718  1.79210131  0.99735453 -1.14320907]
 [ 0.24813545 -0.75511727 -1.62875077 -0.80221798 -0.90230901]
 [ 1.46063906 -0.13240113  0.39441804  0.38129038 -0.65697071]]


In [129]:
# shape, size, number of dimensions, number of items
print(a.ndim)
print(a.dtype)
print(a.shape)
print(a.size)

2
int64
(2, 2)
4


In [134]:
# size of the array in bytes
print('size of each item in an array in bytes:', a.itemsize, 'bytes')
print('size of the array in bytes:', a.nbytes, 'bytes')

size of each item in an array in bytes: 8 bytes
size of the array in bytes: 32 bytes


# Array manipulation

## Concatenating

In [279]:
a = np.arange(0, 12, 1).reshape(4,3)
print(a)

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]


In [280]:
# reshape
b = a.reshape(2, 6)
print(b)

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]]


In [281]:
# adding new dimension
b = a[:, :, np.newaxis]
c = a[np.newaxis, :, :]
print(a.shape)
print(b.shape)
print(c.shape)

(4, 3)
(4, 3, 1)
(1, 4, 3)


In [282]:
# stacking a list of arrays along a new axis
# basically converts a list of arrays into one array
seq = [np.random.randn(2, 4) for _ in range(10)]
print(seq[0].shape)
stacked = np.stack(seq)
print(stacked.shape)

(2, 4)
(10, 2, 4)


In [289]:
# merging two arrays
# vertically stacking a list of arrays
a = np.ones((3,3))
b = np.zeros((3,3))
c = np.vstack([a,b])
print(c)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [290]:
# in case of higher dimensional arrays, stacking along the first axis
a = np.random.randn(3,3,3)
b = np.random.randn(1,3,3)
c = np.vstack([a,b,b])
print(a.shape)
print(b.shape)
print(c.shape)

(3, 3, 3)
(1, 3, 3)
(5, 3, 3)


In [291]:
# merging two arrays
# vertically stacking a list of arrays
a = np.ones((3,3))
b = np.zeros((3,3))
c = np.hstack([a,b])
print(c)

[[1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0.]]


In [292]:
# in case of higher dimensional arrays, stacking along the second axis
a = np.random.randn(3,3,3)
b = np.random.randn(3,2,3)
c = np.hstack([a,b])
print(a.shape)
print(b.shape)
print(c.shape)

(3, 3, 3)
(3, 2, 3)
(3, 5, 3)


In [293]:
# generalisation of vstack and hstack.
# concatenating along a given axis
a = np.random.randn(3,3,3)
b = np.random.randn(3,3,3)
c = np.concatenate([a,b], axis = 0)
print(c.shape)
c = np.concatenate([a,b], axis = 1)
print(c.shape)
c = np.concatenate([a,b], axis = 2)
print(c.shape)

(6, 3, 3)
(3, 6, 3)
(3, 3, 6)


## Split

In [305]:
a = np.arange(0, 30, 1).reshape(3,10)
print(a)

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]]


In [308]:
b = np.vsplit(a, [2])
print(b[0])
print('*******')
print(b[1])

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]]
*******
[[20 21 22 23 24 25 26 27 28 29]]


In [310]:
b = np.hsplit(a, [3,6])
print(b[0])
print('*******')
print(b[1])
print('*******')
print(b[2])

[[ 0  1  2]
 [10 11 12]
 [20 21 22]]
*******
[[ 3  4  5]
 [13 14 15]
 [23 24 25]]
*******
[[ 6  7  8  9]
 [16 17 18 19]
 [26 27 28 29]]


# Array indexing

## Two dimensional array

In [202]:
a = np.arange(0, 10, 1)
print(a)

[0 1 2 3 4 5 6 7 8 9]


In [204]:
# indexing a single element
a[2]

2

In [206]:
# indexing first five elements
a[:5]

array([0, 1, 2, 3, 4])

In [208]:
# indexing last five elements
a[-5:]

array([5, 6, 7, 8, 9])

In [209]:
# indexing a subset
a[2:-2]

array([2, 3, 4, 5, 6, 7])

In [215]:
# indexing with interval of 2 from 3rd element
a[3::2]

array([3, 5, 7, 9])

In [217]:
# inverse array
a[-3::-1]

array([7, 6, 5, 4, 3, 2, 1, 0])

## Multi-dimensional array

In [219]:
a = np.random.randint(low = 1, high = 10, size = (4, 8))
print(a)

[[8 3 6 5 1 4 6 5]
 [2 6 9 6 3 7 6 7]
 [7 8 1 1 4 4 6 7]
 [3 5 8 3 1 4 3 7]]


In [221]:
# index a single element
a[2,3]

1

In [222]:
# index a column
a[:, 1]

array([3, 6, 8, 5])

In [223]:
# index a rrow
a[1,:]

array([2, 6, 9, 6, 3, 7, 6, 7])

In [226]:
# index multiple columns
a[:, 2:4]

array([[6, 5],
       [9, 6],
       [1, 1],
       [8, 3]])

In [228]:
# index multiple rows
a[2:4, :]

array([[7, 8, 1, 1, 4, 4, 6, 7],
       [3, 5, 8, 3, 1, 4, 3, 7]])

In [231]:
# index multiple rows and columns
a[1:3, 3:5]

array([[6, 3],
       [1, 4]])

In [236]:
# simple way of indexing the first dimension
a = np.random.rand(4,3,2)
print(a[0].shape)

(3, 2)


Index subarray is a view rather than a copy.  
Changing a subarray will result in the change of the original array.

In [272]:
a = np.arange(0, 10, 1)
print(a)

[0 1 2 3 4 5 6 7 8 9]


## View & Copy

In [273]:
# change in subarray results in the change of the original array
sub_a = a[2:5]
print(sub_a)
sub_a[1] = 999
print(a)

[2 3 4]
[  0   1   2 999   4   5   6   7   8   9]


If the above behaviour is not what you are want, you need to explicitly create a copy.

In [274]:
a = np.arange(0, 10, 1)
print(a)

[0 1 2 3 4 5 6 7 8 9]


In [276]:
# change in subarray does not result in the change of the original array
sub_a = a[2:5].copy()
print(sub_a)
sub_a[1] = 999
print(a)

[2 3 4]
[0 1 2 3 4 5 6 7 8 9]


# Single array operations

In [327]:
a = np.random.randint(low = 0, high = 10, size = (3, 5))
print(a)

[[8 8 6 8 1]
 [3 5 3 7 3]
 [9 4 6 8 8]]


In [329]:
# Statistics of all elements of an array
print('Sum: ', np.sum(a))
print('Standard deviation: ',np.std(a))
print('Mean:', np.mean(a))
print('Median:',np.median(a))
print('Cumulative sum:', np.cumsum(a))
print('Cumulative product', np.cumprod(a))

Sum:  87
Standard deviation:  2.4000000000000004
Mean: 5.8
Median: 6.0
Cumulative sum: [ 8 16 22 30 31 34 39 42 49 52 61 65 71 79 87]
Cumulative product [          8          64         384        3072        3072        9216
       46080      138240      967680     2903040    26127360   104509440
   627056640  5016453120 40131624960]


In [331]:
# Row-wise or column wise statistics
# calculate along the given axis
print('sum along columns', np.sum(a, axis = 0))
print('standard deviation along rows', np.std(a, axis = 1))
print('mean along columns', np.mean(a, axis = 0))
print('Cumulative sum along columns', np.cumsum(a, axis = 1))
print('Cumulative product along columns', np.cumprod(a, axis = 1))

sum along columns [20 17 15 23 12]
standard deviation along rows [2.71293199 1.6        1.78885438]
mean along columns [6.66666667 5.66666667 5.         7.66666667 4.        ]
Cumulative sum along columns [[ 8 16 22 30 31]
 [ 3  8 11 18 21]
 [ 9 13 19 27 35]]
Cumulative product along columns [[    8    64   384  3072  3072]
 [    3    15    45   315   945]
 [    9    36   216  1728 13824]]


In [256]:
a = np.arange(0, 10, 1)
print(a)

[0 1 2 3 4 5 6 7 8 9]


Python's built-in aggregation function is understood by numpy arrays. But the computation is much slower.

In [332]:
a = np.random.randn(100000)

In [335]:
%timeit np.min(a)
%timeit min(a)

45.2 µs ± 2.44 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
6.66 ms ± 853 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [337]:
%timeit np.sum(a)
%timeit sum(a)

44.4 µs ± 6.38 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
8.41 ms ± 424 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Array computation

Numpy understands Python's built in arithmetic operators.  
In addition, Numpy has functions for each arithmetic operators.  
They can be used interchangably.

\+  
np.add	Addition (e.g., 1 + 1 = 2)  
\-	  
np.subtract	Subtraction (e.g., 3 - 2 = 1)  
\-	  
np.negative	Unary negation (e.g., -2)  
\*	  
np.multiply	Multiplication (e.g., 2 * 3 = 6)  
\/	  
np.divide	Division (e.g., 3 / 2 = 1.5)  
\//

np.floor_divide	Floor division (e.g., 3 // 2 = 1)  
\**	  

np.power	Exponentiation (e.g., 2 ** 3 = 8)  
\%  
np.mod	Modulus/remainder (e.g., 9 % 4 = 1)  

Unlike aggregation functions, they are computationally equivalent

In [339]:
a = np.random.randn(100000)
%timeit np.add(a, a)
%timeit a + a

56.4 µs ± 9.86 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
44.8 µs ± 4.82 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [99]:
# elementwise summation and substraction
c = a+b
print(c)

[[ 8  0 12]
 [ 4  7  7]
 [ 9  8  9]]


In [100]:
# elementwise multiplication
c = a*b
print(c)

[[15  0 32]
 [ 3 10 12]
 [14 12 14]]


In [101]:
# elementwise division
c = a/b
print(c)

[[0.6               nan 0.5       ]
 [0.33333333 2.5        0.75      ]
 [3.5        0.33333333 3.5       ]]


  c = a/b


##### Broadcasting

In [316]:
a = np.random.randint(low = 0, high = 9, size = (4,3))
b = np.random.randint(low = 0, high = 9, size = (1,3))
print(a)
print(b)

[[0 8 1]
 [2 4 4]
 [1 0 2]
 [1 7 5]]
[[8 4 4]]


In [317]:
a - b

array([[-8,  4, -3],
       [-6,  0,  0],
       [-7, -4, -2],
       [-7,  3,  1]])

In [318]:
a * b

array([[ 0, 32,  4],
       [16, 16, 16],
       [ 8,  0,  8],
       [ 8, 28, 20]])

# Trigonometry

In [322]:
theta = np.linspace(0, np.pi, 3)

In [321]:
print("theta      = ", theta)
print("sin(theta) = ", np.sin(theta))
print("cos(theta) = ", np.cos(theta))
print("tan(theta) = ", np.tan(theta))

theta      =  [0.         1.57079633 3.14159265]
sin(theta) =  [0.0000000e+00 1.0000000e+00 1.2246468e-16]
cos(theta) =  [ 1.000000e+00  6.123234e-17 -1.000000e+00]
tan(theta) =  [ 0.00000000e+00  1.63312394e+16 -1.22464680e-16]


In [323]:
x = [-1, 0, 1]
print("x         = ", x)
print("arcsin(x) = ", np.arcsin(x))
print("arccos(x) = ", np.arccos(x))
print("arctan(x) = ", np.arctan(x))

x         =  [-1, 0, 1]
arcsin(x) =  [-1.57079633  0.          1.57079633]
arccos(x) =  [3.14159265 1.57079633 0.        ]
arctan(x) =  [-0.78539816  0.          0.78539816]


# Logarithm, Exponents

In [326]:
x = [1, 2, 4, 10]
print("x        =", x)
print("ln(x)    =", np.log(x))
print("log2(x)  =", np.log2(x))
print("log10(x) =", np.log10(x))

x        = [1, 2, 4, 10]
ln(x)    = [0.         0.69314718 1.38629436 2.30258509]
log2(x)  = [0.         1.         2.         3.32192809]
log10(x) = [0.         0.30103    0.60205999 1.        ]


In [325]:
x = [1, 2, 3]
print("x     =", x)
print("e^x   =", np.exp(x))
print("2^x   =", np.exp2(x))
print("3^x   =", np.power(3, x))

x     = [1, 2, 3]
e^x   = [ 2.71828183  7.3890561  20.08553692]
2^x   = [2. 4. 8.]
3^x   = [ 3  9 27]


# Comparison operation

## Comparison operation

In [359]:
x = np.arange(0, 9, 1)
x > 5

array([False, False, False, False, False, False,  True,  True,  True])

In [360]:
x = np.arange(0, 9, 1)
x >= 5

array([False, False, False, False, False,  True,  True,  True,  True])

In [351]:
x = np.arange(0, 9, 1)
x != 5

array([ True,  True,  True,  True,  True, False,  True,  True,  True])

In [349]:
x = np.arange(0, 9, 1)
x == 5

array([False, False, False, False, False,  True, False, False, False])

In [361]:
# are there any values greater than 5
x = np.arange(0, 9, 1)
np.any(x > 5)

True

In [362]:
# are all values greater than 5
np.all(x > 0.5)

False

In [352]:
a = np.random.random(100000)
a = a <= 0.5

In [353]:
# counting True values
%timeit np.count_nonzero(a)
%timeit np.sum(a)
%timeit sum(a)

7.43 µs ± 1.31 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
112 µs ± 4.47 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
269 ms ± 84.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [354]:
print(np.count_nonzero(a))
print(np.sum(a))
print(sum(a))

49934
49934
49934


## Boolean masking

In [367]:
a = np.random.randn(5,4)
print(a)

[[ 0.43903932 -1.71353799  1.39738867 -0.92849233]
 [-0.49209912 -0.49837736  0.53365281 -0.07040866]
 [ 2.42097781  1.70086237 -0.48937059 -1.79344543]
 [-0.23220884  0.50665781  0.72812805 -0.14770891]
 [ 0.03210276 -0.3834076   0.11671139 -0.62691052]]


In [368]:
a > 0

array([[ True, False,  True, False],
       [False, False,  True, False],
       [ True,  True, False, False],
       [False,  True,  True, False],
       [ True, False,  True, False]])

In [369]:
a[a>0]

array([0.43903932, 1.39738867, 0.53365281, 2.42097781, 1.70086237,
       0.50665781, 0.72812805, 0.03210276, 0.11671139])

# Useful functions

In [6]:
#count elements
a = np.random.randint(low = 1, high = 9, size = 10000)
print(np.unique(a, return_counts=True))

(array([1, 2, 3, 4, 5, 6, 7, 8]), array([1294, 1259, 1212, 1252, 1267, 1250, 1252, 1214]))


In [5]:
a

array([2, 5, 2, ..., 1, 6, 5])