In [5]:
import sys
#the convention is to import numpy as np
import numpy as np

# Array creation

In [2]:
array_1d = np.array([4, 5, 3])
type(array_1d)

numpy.ndarray

In [3]:
len(array_1d)

3

In [4]:
array_1d.shape

(3,)

In [5]:
array_1d.ndim

1

In [6]:
# A 2-D array is built from a list of equal-length rows (here 2 rows x 3 columns).
matrix = np.array([
    [1, 2, 1],
    [5, 43, 5],
])

matrix

array([[ 1,  2,  1],
       [ 5, 43,  5]])

In [7]:
matrix.shape

(2, 3)

In [8]:
matrix.ndim

2

`np.eye(n)` creates an identity matrix shaped `nxn`

In [9]:
np.eye(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

`np.zeros` creates an array with all its values set to zeros

In [10]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [11]:
np.zeros((3,2))

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

`np.ones` creates an array with all its values set to ones (${\vec {1}}$)

In [12]:
np.ones(3)

array([1., 1., 1.])

`np.random.random` creates an array with random values in the half-open interval [0, 1)

In [13]:
np.random.random(2)

array([0.76634763, 0.23251991])

In [14]:
np.random.random((2,3))

array([[0.71840538, 0.7670865 , 0.83282447],
       [0.41445022, 0.72660421, 0.57117472]])

We can read text as numpy arrays

In [15]:
!cat np_text.txt

1,2,3
43, 2,3
34,1,1
0,1,1

In [16]:
# Parse the comma-separated file into a 2-D array (one row per line of text).
# The values come back as floats, as the displayed result shows.
np_text = np.genfromtxt("np_text.txt", delimiter=",")
np_text

array([[ 1.,  2.,  3.],
       [43.,  2.,  3.],
       [34.,  1.,  1.],
       [ 0.,  1.,  1.]])

# Advantages of np.array versus lists

In [17]:
array_1d = np.array([1,2,3])

In [18]:
array_1d.shape

(3,)

In [1]:
# IPython help lookup for sys.getsizeof (the trailing `?` is IPython syntax, not Python).
# NOTE(review): the execution count (In [1]) and the "not found" output suggest this
# ran before `import sys`; re-run it after the import cell or drop it from the final notebook.
sys.getsizeof?

Object `sys.getsizeof` not found.


In [19]:
# The same 2x3 integer data, once as nested Python lists and once as an ndarray.
list_2d = [
    [1222, 2222, 2223],
    [5, 23, 40004],
]
array_2d = np.array(list_2d)

# sys.getsizeof only reports the memory attributed directly to the object it is
# given; for list_2d that is the outer list, not the nested rows or their ints.
list_bytes = sys.getsizeof(list_2d)
array_bytes = sys.getsizeof(array_2d)

print("Size of the list in memory: {} bytes".format(list_bytes))
print("Size of the numpy array in memory: {} bytes".format(array_bytes))

Size of the list in memory: 80 bytes
Size of the numpy array in memory: 160 bytes


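It seems like the numpy array takes up more space! Note, however, that `sys.getsizeof` only counts the memory of the outer list object itself (its header plus one pointer per row), not the nested row lists or the integers they hold, so the figure for the list is an underestimate. This was also a very simple example. Let's see what happens if we work with bigger arrays.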

In [21]:
# One million consecutive integers, once as a Python list and once as an ndarray.
big_list = list(range(1000000))
# np.arange builds the integer array directly, instead of converting a Python
# range object element by element as np.array(range(...)) does.
big_array = np.arange(1000000)

# Sizes are converted from bytes to MiB (1024**2 bytes) for display.
print("Size of the list in memory: {:.2f} MegaBytes".format(
    sys.getsizeof(big_list)/(1024**2)))
print("Size of the numpy array in memory: {:.2f} MegaBytes".format(
    sys.getsizeof(big_array)/(1024**2)))

Size of the list in memory: 8.58 MegaBytes
Size of the numpy array in memory: 7.63 MegaBytes


We can run `%%timeit` with the parameter `-r` to specify how many times the measurement is repeated (and `-n` to set the number of loops in each repetition)

In [22]:
%%timeit -r 5
sum(range(1000000))

14.2 ms ± 560 µs per loop (mean ± std. dev. of 5 runs, 100 loops each)


In [23]:
%%timeit -r 5
np.sum(np.arange(1000000))

1.42 ms ± 60.8 µs per loop (mean ± std. dev. of 5 runs, 1000 loops each)


We can see that using numpy is 10x faster. This is because numpy arrays are much simpler objects than python lists (which come with lots of "batteries built in" that slow them down). Besides that, numpy arrays store all of their data in contiguous memory blocks, so accessing them is faster.

![](https://jakevdp.github.io/PythonDataScienceHandbook/figures/array_vs_list.png)

In order to really make use of numpy's optimizations, all of the elements in the array must be of the same type. If that is not the case, numpy will try to convert all the elements to the same data type.

In [24]:
np.array([1.0, 5.4])

array([ 1. ,  5.4])

In [25]:
np.array([1, 3.9])

array([ 1. ,  3.9])

We see numpy transformed the integer into a float

In [26]:
np.array([1,1,1,1,1, "1"])

array(['1', '1', '1', '1', '1', '1'], 
      dtype='<U21')

We can use the `dtype` attribute to see the array's data type.

In [27]:
np.array([1, 3.9]).dtype

dtype('float64')

In [28]:
np.array([1, "1"])

array(['1', '1'], 
      dtype='<U21')

<hr>

### END OF BASIC NUMPY USAGE

If numpy can't optimize an array, it will use the datatype `object`, which means that the elements will be regular python objects.

In [29]:
from datetime import datetime
today = datetime.utcnow()

In [30]:
today

datetime.datetime(2018, 6, 26, 16, 21, 49, 355033)

In [31]:
# An int and a datetime share no common numeric type, so numpy falls back to
# dtype=object. `datetime` is already imported in the cell that defines `today`,
# so it is not imported a second time here.
np.array([1, datetime.utcnow()])

array([1, datetime.datetime(2018, 6, 26, 16, 21, 53, 581876)], dtype=object)

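We can use `np.vectorize` to apply a Python function element-wise to an array. It is provided for convenience rather than for performance (internally it is essentially a Python loop), so when we work with python objects the performance won't improve, as the timings below show.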

In [32]:
def format_string(x):
    """Return ``x`` rendered as text using ``"{}".format(x)``.

    Defined with ``def`` rather than a lambda bound to a name, so the function
    carries its own name in tracebacks and reprs.
    """
    return "{}".format(x)

In [33]:
format_string(1)

'1'

In [46]:
%%timeit -r 5 -n 100
["{}".format(i) for i in range(1000)]

242 µs ± 58.3 µs per loop (mean ± std. dev. of 5 runs, 100 loops each)


In [47]:
# Wrap the format_string helper defined earlier so it is applied to every
# element of an array, instead of repeating an identical anonymous lambda.
f = np.vectorize(format_string)

In [48]:
%%timeit -r 5 -n 100
f(np.arange(1000))

394 µs ± 56 µs per loop (mean ± std. dev. of 5 runs, 100 loops each)


# Slicing

It is very easy to take subsections of numpy arrays; this is called *"slicing"* (since we take a slice of the array).

In [49]:
# 3 rows x 4 columns, holding the integers 1..12 in row order.
matrix_34 = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])
matrix_34

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

We get the first row the same way we would do with a regular python list

In [50]:
matrix_34[0]

array([1, 2, 3, 4])

We can choose the first 2 rows

In [51]:
matrix_34[:2]

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

We can select the 2nd element of each row (that is, the 2nd column):

In [52]:
matrix_34[:,1]

array([ 2,  6, 10])

When we slice we don't get copies, but views that reference the same elements in the original array:

In [53]:
two_first_rows = matrix_34[:2,:]
two_first_rows

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [54]:
print(matrix_34)
two_first_rows[0, 0] = 100
matrix_34

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


array([[100,   2,   3,   4],
       [  5,   6,   7,   8],
       [  9,  10,  11,  12]])

However, if we index a single element (instead of taking a slice) we get the value itself, and assigning a new value to that variable only rebinds the name; it doesn't affect the original array

In [55]:
# Indexing with two integers picks out a single element (row 1, column 3).
selection = matrix_34[1,3]
print(selection)
# Rebinding the name `selection` does not write into matrix_34,
# which is displayed unchanged below.
selection = 100
matrix_34

8


array([[100,   2,   3,   4],
       [  5,   6,   7,   8],
       [  9,  10,  11,  12]])

# Filtering

In [56]:
matrix_32 = np.array([[1, 4], [2, 4], [5, 0]])
matrix_32

array([[1, 4],
       [2, 4],
       [5, 0]])

In [57]:
filter_index = (matrix_32 >= 2)
filter_index

array([[False,  True],
       [ True,  True],
       [ True, False]], dtype=bool)

In [58]:
matrix_32[matrix_32>3]

array([4, 4, 5])

In [59]:
matrix_32[filter_index]

array([4, 2, 4, 5])

## Arithmetic operations with numpy arrays

In [6]:
# Two 2x2 integer matrices used by the arithmetic examples.
array1 = np.array([[2, 3], [0, 1]])
array2 = np.array([[23, 6], [0, 42]])

# One print call, one array per line.
print(array1, array2, sep="\n")

[[2 3]
 [0 1]]
[[23  6]
 [ 0 42]]


Addition works with numpy arrays (it sums the two arrays element-wise)

In [7]:
array1 + array2

array([[25,  9],
       [ 0, 43]])

Same thing applies to the other operators (`-,/ and *`)

In [8]:
array1 * array2

array([[46, 18],
       [ 0, 42]])

In [9]:
array1/array2

  """Entry point for launching an IPython kernel.


array([[0.08695652, 0.5       ],
       [       nan, 0.02380952]])

In [105]:
array1-array2

array([[-21,  -3],
       [  0, -41]])

We can use the symbol `@` to perform a matrix multiplication (the same as doing `np.dot(arr1, arr2)`). *(Note: this only works with python>=3.5)*.

In [106]:
array1 @ array2

array([[ 46, 138],
       [  0,  42]])

This is equivalent to:

In [107]:
array1.dot(array2)

array([[ 46, 138],
       [  0,  42]])

Like in regular linear algebra, the dimensions of the matrices we are trying to multiply must be compatible (second dimension of the first one = first dimension of the second one):

In [108]:
array1 = np.array([[2,3],[0,1]])

# array2 is deliberately rebound to a 3x2 matrix so its first dimension (3)
# no longer matches array1's second dimension (2).
array2 = np.array([[2,3],[0,1], [7,5]])

# The mismatch error is the point of this cell: catch it and show the message,
# so the notebook still runs top to bottom without stopping here.
try:
    array1 @ array2
except ValueError as err:
    print("{}: {}".format(type(err).__name__, err))

ValueError: shapes (2,2) and (3,2) not aligned: 2 (dim 1) != 3 (dim 0)

In this particular scenario we can transpose the second array to make it match the first one. *Transposing a matrix means swapping its rows and columns.*

In [109]:
array2

array([[2, 3],
       [0, 1],
       [7, 5]])

In [111]:
array1 @ array2.T

array([[13,  3, 29],
       [ 3,  1,  5]])