_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 4 - NumPy Basics: Arrays and Vectorized Computation
### Part 1 - The NumPy ndarray: A Multidimensional Array Object

In [1]:
import numpy as np

In [2]:
# comparing numpy ndarray with plain list
an_array = np.arange(1000000)
a_list = list(range(1000000))
type(an_array), type(a_list)

(numpy.ndarray, list)

In [3]:
%time for _ in range(10): an_array * 2

Wall time: 83.3 ms


In [4]:
%time for _ in range(10): [x * 2 for x in a_list]

Wall time: 1 s


In [5]:
# 2-d ndarray, 2x3 random numbers from gaussian distribution
data = np.random.randn(2, 3)
data

array([[-0.77083469,  0.23786738,  0.68164017],
       [ 0.21080361, -0.41094009,  1.33501061]])

In [6]:
# operation is applied to each value in the array
data * 10

array([[-7.70834693,  2.37867381,  6.8164017 ],
       [ 2.10803611, -4.10940092, 13.35010614]])

In [7]:
# noteworthy ndarray attributes
data.shape, data.dtype, data.ndim

((2, 3), dtype('float64'), 2)

In [8]:
# costructor from a sequence-like object, like a list
d = np.array([3, 5, 6, 8])
d

array([3, 5, 6, 8])

In [9]:
# nested sequences lead to multidimesional array
d2 = np.array([[3, 5, 6, 8], [34, .5, 2, 3]])
d2

array([[ 3. ,  5. ,  6. ,  8. ],
       [34. ,  0.5,  2. ,  3. ]])

In [10]:
# arrays of zeroes
np.zeros((4, 3)), np.zeros(3, int)

(array([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]), array([0, 0, 0]))

In [11]:
# uninitialized array
np.empty((2, 2, 2))

array([[[ 3. ,  5. ],
        [ 6. ,  8. ]],

       [[34. ,  0.5],
        [ 2. ,  3. ]]])

In [12]:
np.identity(4, int)

array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1]])

In [13]:
np.eye(4, 5, dtype=np.int8)

array([[1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0]], dtype=int8)

In [14]:
# vectorization
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [15]:
1 / arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [16]:
arr ** 0.5

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

In [17]:
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])

In [18]:
arr2 > arr

array([[False,  True, False],
       [ True, False,  True]])

In [19]:
# broadcasting a value to a slice
arr = np.arange(10)
print(arr)
arr[5:8] = 12
print(arr)

[0 1 2 3 4 5 6 7 8 9]
[ 0  1  2  3  4 12 12 12  8  9]


In [20]:
# an array slice is a view, not a copy
ar_slice = arr[5:8]
print(ar_slice)
ar_slice[1] = 42
print(ar_slice, arr)

# bare slice assignment
ar_slice[:] = -1
print(ar_slice, arr)

[12 12 12]
[12 42 12] [ 0  1  2  3  4 12 42 12  8  9]
[-1 -1 -1] [ 0  1  2  3  4 -1 -1 -1  8  9]


In [21]:
# 2D array
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(arr2d, '\n\n', arr2d[2], '\n\n', arr2d[0][2], '==', arr2d[0, 2])

[[1 2 3]
 [4 5 6]
 [7 8 9]] 

 [7 8 9] 

 3 == 3


In [22]:
# 3D array
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
print(arr3d, '\n\n', arr3d[0])

# assign a scalar to sub-array in numpy array
backup = arr3d[0].copy()
print('broadcast of 42:')
arr3d[0] = 42
print(arr3d)

# assign an array to sub-array in numpy array
print('array assignment:')
arr3d[0] = backup
print(arr3d)

# slice from multidim array
print('values with indices starting with (1, 0):\n', arr3d[1,0], '==', arr3d[1][0])


[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]] 

 [[1 2 3]
 [4 5 6]]
broadcast of 42:
[[[42 42 42]
  [42 42 42]]

 [[ 7  8  9]
  [10 11 12]]]
array assignment:
[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]
values with indices starting with (1, 0):
 [7 8 9] == [7 8 9]


In [23]:
# 2D slicing
print(arr2d)
print('First two rows:')
print(arr2d[:2])
print('First two rows, from second column on:')
print(arr2d[:2, 1:])
print('Get a specific row - the second one - up to the second column:')
print(arr2d[1, :2])
print('Get a specific column - the third one - first two rows:')
print(arr2d[:2, 2])
print('All rows, first column only:')
print(arr2d[:, :1])


[[1 2 3]
 [4 5 6]
 [7 8 9]]
First two rows:
[[1 2 3]
 [4 5 6]]
First two rows, from second column on:
[[2 3]
 [5 6]]
Get a specific row - the second one - up to the second column:
[4 5]
Get a specific column - the third one - first two rows:
[3 6]
All rows, first column only:
[[1]
 [4]
 [7]]


In [24]:
# Boolean indexing (always leads to copy)
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)

print('Full data:')
print(data)
print('"Bob" data:')
print(data[names == 'Bob'])
print('"Bob" data from column 2 on:')
print(data[names == 'Bob', 2:])
print('"Bob" data, only from column 3:')
print(data[names == 'Bob', 3])
print('Not "Bob" data:')
print(data[names != 'Bob'])
print('Not "Bob" data, again:')
print(data[~(names == 'Bob')])
print('"Bob or Will" data:')
print(data[(names == 'Bob') | (names == 'Will')])  # careful with precedence

# broadcasting values through boolean indexing
print('Negative values defaulted to zero:')
data[data < 0] = 0
print(data)

Full data:
[[ 1.25655431  0.69117979 -0.36603784  0.3886745 ]
 [ 0.4503915   0.29649378  0.01557227 -1.63843521]
 [ 0.23849015 -0.07426864 -0.99157863  0.11937829]
 [-0.25257432 -0.3716524  -2.22789551  0.40396519]
 [ 0.59664815  1.7325264  -2.29762654  1.91035665]
 [ 2.51043723 -0.92332618 -0.86072633  0.29370507]
 [-2.0796022  -0.44730305 -0.31722299  0.37599277]]
"Bob" data:
[[ 1.25655431  0.69117979 -0.36603784  0.3886745 ]
 [-0.25257432 -0.3716524  -2.22789551  0.40396519]]
"Bob" data from column 2 on:
[[-0.36603784  0.3886745 ]
 [-2.22789551  0.40396519]]
"Bob" data, only from column 3:
[0.3886745  0.40396519]
Not "Bob" data:
[[ 0.4503915   0.29649378  0.01557227 -1.63843521]
 [ 0.23849015 -0.07426864 -0.99157863  0.11937829]
 [ 0.59664815  1.7325264  -2.29762654  1.91035665]
 [ 2.51043723 -0.92332618 -0.86072633  0.29370507]
 [-2.0796022  -0.44730305 -0.31722299  0.37599277]]
Not "Bob" data, again:
[[ 0.4503915   0.29649378  0.01557227 -1.63843521]
 [ 0.23849015 -0.07426864 -0.9

In [32]:
# Fancy indexing == indexing using integer arrays - the result is always a copy
arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i
print(arr)

print("Fancy indexing to select rows 4, 3, 0, 6:")
print(arr[[4, 3, 0, 6]])
print("Fancy indexing with negative indices:")
print(arr[[-3, -5, -7]])

arr = np.arange(32).reshape((8, 4))
print('\n',arr)
print('Picking elements passing their coordinates')
print('Get (1, 0), (5, 3), (7, 1), (2, 2):', arr[[1, 5, 7, 2], [0, 3, 1, 2]])

[[0. 0. 0. 0.]
 [1. 1. 1. 1.]
 [2. 2. 2. 2.]
 [3. 3. 3. 3.]
 [4. 4. 4. 4.]
 [5. 5. 5. 5.]
 [6. 6. 6. 6.]
 [7. 7. 7. 7.]]
Fancy indexing to select rows 4, 3, 0, 6:
[[4. 4. 4. 4.]
 [3. 3. 3. 3.]
 [0. 0. 0. 0.]
 [6. 6. 6. 6.]]
Fancy indexing with negative indices:
[[5. 5. 5. 5.]
 [3. 3. 3. 3.]
 [1. 1. 1. 1.]]

 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]
 [24 25 26 27]
 [28 29 30 31]]
Picking elements passing their coordinates
Get (1, 0), (5, 3), (7, 1), (2, 2): [ 4 23 29 10]


In [36]:
# fancy indexing to select rows and columns in a specific order
print(arr)
arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]
 [24 25 26 27]
 [28 29 30 31]]


array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

In [52]:
# Transposing Arrays
arr = np.arange(15).reshape((3, 5))
print(arr)
print('Transposed to:')
print(arr.T)

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
Transposed to:
[[ 0  5 10]
 [ 1  6 11]
 [ 2  7 12]
 [ 3  8 13]
 [ 4  9 14]]
