## NumPy Basics: Arrays and Vectorised Computation

NumPy, short for Numerical Python, provides:
- ndarray, a multidimensional array
- mathematical functions for fast operations on entire arrays of data without writing loops
- tools for reading/writing array data to disk and working with memory-mapped files
- linear algebra, random number generation, and Fourier transform capabilities
- an easy-to-use C API to pass data to external libraries

NumPy:
- has no modeling or scientific functionality of its own
- but understanding its array-oriented computing helps when using tools like pandas

- internally stores data in a contiguous block of memory
- performs complex computations on entire arrays

In [None]:
#Performance difference between NumPy array and Python list:

import numpy as np
my_arr = np.arange(1000000)
#named my_list (not mylist) so it matches the name used in the timing loop below
my_list = list(range(1000000))

#multiply each sequence by 2:
#the timings below are the textbook transcript, not run in this notebook.
#They are kept as comments so the cell stays valid Python; to reproduce them,
#run each `%time for ...` statement in its own cell.

# In [10]: %time for _ in range(10):
#         my_arr2 = my_arr * 2
#         CPU times: user 20 ms, sys: 50 ms, total: 70 ms
#         Wall time: 72.4 ms
#
# In [11]: %time for _ in range(10):
#         my_list2 = [x * 2 for x in my_list]
#         CPU times: user 760 ms, sys: 290 ms, total: 1.05 s
#         Wall time: 1.05 s

## 4.1 The NumPy ndarray: A Multidimensional Array Object

In [4]:
#Generate some random data

data = np.random.randn(2,3)
data

#2 rows and 3 columns

array([[-0.12906644,  1.72042   , -0.70344464],
       [-0.4275145 , -2.09349478, -0.35925084]])

In [6]:
#write mathematical operations with data

data * 10

array([[ -1.29066439,  17.20419997,  -7.03444642],
       [ -4.27514499, -20.93494783,  -3.59250841]])

In [7]:
data + data

array([[-0.25813288,  3.44083999, -1.40688928],
       [-0.855029  , -4.18698957, -0.71850168]])

In [8]:
# ndarray is a generic multidimensional container

data.shape

#2 rows and 3 columns

(2, 3)

In [9]:
data.dtype

#type of data = float? string? int?

dtype('float64')

In [10]:
#Create ndarrays using array function
#can convert list to array

data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)

arr1

array([6. , 7.5, 8. , 0. , 1. ])

In [11]:
#Nested list convert to multi-dimensional array

data2 = [[1,2,3,4], [5,6,7,8]]
arr2 = np.array(data2)
arr2

#2 rows, 4 columns (each inner list becomes one row)

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [12]:
arr2.dtype

dtype('int32')

In [13]:
#arr2 has two dimensions with shape inferred from the data. 
#as data2 was a list of lists
arr2.shape

(2, 4)

In [14]:
#arr2 has two dimensions with shape inferred from the data. 
#as data2 was a list of lists
arr2.ndim

2

In [15]:
arr1.dtype

dtype('float64')

In [16]:
arr2.dtype

dtype('int32')

In [17]:
#zeros and ones create arrays of 0s or 1s
#with given length or shape

np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
#zeros and ones create arrays of 0s or 1s
#with given length or shape

np.zeros((3,6))

#use a tuple

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [23]:
#empty creates an array 
#without initializing its values to any particular value.

np.empty((2,3,2))

# 2 sets, 3 rows, 2 columns
#not safe to assume that np.empty will return an array of all zeros. 
#In some cases, it may return uninitialized “garbage” values.
# ==> below


array([[[7.63877737e-312, 2.47032823e-322],
        [0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 1.58817677e-052]],

       [[6.50358915e-091, 9.73701415e-072],
        [5.50778592e+169, 6.28542758e-038],
        [3.99910963e+252, 4.93432906e+257]]])

In [24]:
#arange is an array-valued version of the built-in Python range function:

np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [25]:
#Data Types for ndarrays

arr1 = np.array([1,2,3], dtype = np.float64)

arr2 = np.array([1,2,3], dtype = np.int32)

arr1.dtype


dtype('float64')

In [26]:
arr2.dtype

dtype('int32')

In [2]:
#numerical dtypes are named like float, int 
#and followed by a number indicating the number of bits per element

#explicitly convert or cast an array from one dtype to another
#using astype method

import numpy as np

arr = np.array([1,2,3,4,5])
arr.dtype

dtype('int32')

In [4]:
#astype method

float_arr = arr.astype(np.float64)
float_arr.dtype

dtype('float64')

In [5]:
#Example: casting floating-point numbers to an integer dtype
#the decimal part will be truncated
#float to integers using .astype()

arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
arr

array([ 3.7, -1.2, -2.6,  0.5, 12.9, 10.1])

In [6]:
arr.astype(np.int32)
#all the decimals are gone

array([ 3, -1, -2,  0, 12, 10])

In [8]:
# .astype()
#if an array of strings represents numbers, astype can convert them to numeric form

#np.bytes_ is the fixed-width byte-string dtype;
#the older np.string_ alias was removed in NumPy 2.0
numeric_strings = np.array(['1.25','-9.6','42'], dtype=np.bytes_)
numeric_strings

#must be cautious when using the fixed-width byte-string type:
#string data in NumPy is fixed size and may truncate input without warning

array([b'1.25', b'-9.6', b'42'], dtype='|S4')

In [9]:
numeric_strings.astype(float) #because lazy

array([ 1.25, -9.6 , 42.  ])

In [10]:
#another way of writing
numeric_strings = np.array(['1.25','-9.6','42'])
numeric_strings.astype(float)

array([ 1.25, -9.6 , 42.  ])

In [11]:
#can use another array's dtype attribute to cast dtype

int_array = np.arange(10)                     #list of integers
calibers = np.array([.22, .34, .50, .357])    #list of float

int_array.astype(calibers.dtype)
#they become float too
#astype always creates a new array, even if the new dtype is same as old dtype

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [16]:
#shorthand type code strings can also be used to refer to a dtype

empty_uint32 = np.empty(8, dtype ='u4')
empty_uint32

array([0, 0, 0, 0, 0, 0, 0, 0], dtype=uint32)

In [18]:
#Arithmetic with NumPy Arrays

#batch operations on array without using for loops
#vectorisation

arr = np.array([[1. , 2., 3.],[4. , 5., 6.]])
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [19]:
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [20]:
arr - arr

array([[0., 0., 0.],
       [0., 0., 0.]])

In [21]:
1 / arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [22]:
arr ** 0.5

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

In [23]:
arr2 = np.array([[0., 4., 1.,], [7., 2., 12.]])
arr2

array([[ 0.,  4.,  1.],
       [ 7.,  2., 12.]])

In [24]:
arr2 > arr

#this is comparison between same sized arrays 

#operations between different sized arrays is called broadcasting

array([[False,  True, False],
       [ True, False,  True]])

In [26]:
#Basic Indexing and Slicing

arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
arr[5]
#select index 5

5

In [28]:
arr_slice = arr[5:8]

#slice out index 5, 6 & 7
#end the cell with the name so the slice is actually displayed
arr_slice

array([5, 6, 7])

In [31]:
arr[5:8] = 12

#replace index 5, 6 & 7 with 12

In [32]:
arr

#the assignment is reflected in the source array
#index 5, 6 & 7 now hold 12

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

In [34]:
arr_slice = arr[5:8]
arr_slice

# arr_slice comes from arr source

array([12, 12, 12])

In [35]:
#when change the values of arr_slice
#mutations reflected in arr

arr_slice[1] = 12345
#index 1 of index 0, 1, 2 replaced from 12 to 12345
# 12, 12345, 12

arr
#reflected in source

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,
           9])

In [36]:
arr_slice[:] = 64
#the entire slice is changed to 64
#from 12, 12345, 12 changed to 64, 64, 64

arr
#reflected in source

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

In [40]:
#if you want a copy of a slice of an ndarray instead of a view, use .copy()
arr3 = arr[5:8].copy()
arr3[:] = 22
print("arr3 = ", arr3, 'using .copy() ')

arr
print("arr = ", arr, 'source is the same, 64, 64 ,64')

arr3 =  [22 22 22] using .copy() 
arr =  [ 0  1  2  3  4 64 64 64  8  9] source is the same, 64, 64 ,64


In [41]:
#higher dimensional arrays
# in a 2-d array, the element at each index is not a scalar but a 1-d array

arr2d = np.array([[1,2,3],[4,5,6],[7,8,9]])

arr2d[2]
#choose the 2nd index set from, 0, 1, 2 index

array([7, 8, 9])

In [42]:
#individual elements are accessed recursively

arr2d[0][2]

#arr2d[0] selects row index 0, then [2] selects the element at column index 2

3

In [43]:
#individual elements are accessed recursively
#use tuple format

arr2d[0,2]
#equivalent to arr2d[0][2]: row index 0, column index 2

3

In [44]:
#in multidimensional arrays, if you omit later indices
#the result is a lower-dimensional ndarray holding all data along the higher dimensions

#2 x 2 x 3 array: 2 sets, each with 2 rows and 3 columns
arr3d = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])

#end the cell with the name so the array is displayed
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [45]:
arr3d[0]

#choose 0 index set

array([[1, 2, 3],
       [4, 5, 6]])

In [46]:
old_values = arr3d[0].copy()
#creating a copy without affecting the source array

arr3d[0] = 42
#convert entire 0 index set to 42

arr3d

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [47]:
arr3d[0] = old_values
# restore the saved copy: the values in old_values are written back into arr3d[0]

arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [48]:
arr3d[1,0]
# select set index 1, then row index 0 within that set

array([7, 8, 9])

In [50]:
#Indexing in 2 steps
x = arr3d[1]
x

array([[ 7,  8,  9],
       [10, 11, 12]])

In [51]:
x[0]

array([7, 8, 9])

In [52]:
#Indexing with slices

# ndarrays can be sliced just like python lists

arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

In [53]:
# slicing 1-d array
arr[1:6]

#select index 1, 2 ,3 ,4 ,5 

array([ 1,  2,  3,  4, 64])

In [54]:
#slicing 2-d array is different

arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [55]:
arr2d[:2]

#slice along axis 0: selects rows 0 & 1 (row 2 is excluded)

array([[1, 2, 3],
       [4, 5, 6]])

In [56]:
arr2d[:2, 1:]

#slice along axis 0: selects rows 0 & 1 (row 2 is excluded)
#within those rows, slice column index 1 onwards

array([[2, 3],
       [5, 6]])

In [57]:
arr2d[1, : 2]

#slice set index 1
#slice column 0,1

array([4, 5])

In [58]:
arr2d[:2,2]

array([3, 6])

In [59]:
arr2d[:,:1]

#choose all rows but only column index 0 (slicing keeps the result 2-d)

array([[1],
       [4],
       [7]])

In [60]:
arr2d[:2, 1:] = 0
arr2d

array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

In [62]:
#Boolean Indexing

#consider an example:
# data in an array and an array of names with duplicates

names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])

data = np.random.randn(7, 4)

names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')

In [63]:
data

array([[-0.90348404, -1.80356343, -2.02068893, -0.31465935],
       [-0.00244133, -0.22006287,  1.03136571,  1.94008976],
       [-1.44987644,  0.21653669, -0.60262569,  1.65677554],
       [-0.11687119, -0.56795246,  0.50862295, -1.26694324],
       [-0.86597802,  0.05631247,  0.61734096,  0.62169134],
       [-0.88758814, -1.08349708, -0.1869806 , -0.29709922],
       [-0.46146129,  0.263424  , -0.09751778, -0.93508853]])

In [64]:
names == 'Bob'

array([ True, False, False,  True, False, False, False])

In [65]:
data[ names == 'Bob']

array([[-0.90348404, -1.80356343, -2.02068893, -0.31465935],
       [-0.11687119, -0.56795246,  0.50862295, -1.26694324]])

In [67]:
#select from the rows where names == 'Bob' and index the columns too

data[ names == 'Bob', 2:]

#slice column index 2 onwards

array([[-2.02068893, -0.31465935],
       [ 0.50862295, -1.26694324]])

In [68]:
data[names == 'Bob', 3]

#select only column index 3 (an integer index, not a slice)

array([-0.31465935, -1.26694324])

In [69]:
#To select everything but 'Bob' use != or negate the condition with ~

names != 'Bob'

array([False,  True,  True, False,  True,  True,  True])

In [74]:
data[names != 'Bob']

# use !=

array([[-0.00244133, -0.22006287,  1.03136571,  1.94008976],
       [-1.44987644,  0.21653669, -0.60262569,  1.65677554],
       [-0.86597802,  0.05631247,  0.61734096,  0.62169134],
       [-0.88758814, -1.08349708, -0.1869806 , -0.29709922],
       [-0.46146129,  0.263424  , -0.09751778, -0.93508853]])

In [75]:
data[ ~(names == 'Bob')]

#use ~ to negate the condition

array([[-0.00244133, -0.22006287,  1.03136571,  1.94008976],
       [-1.44987644,  0.21653669, -0.60262569,  1.65677554],
       [-0.86597802,  0.05631247,  0.61734096,  0.62169134],
       [-0.88758814, -1.08349708, -0.1869806 , -0.29709922],
       [-0.46146129,  0.263424  , -0.09751778, -0.93508853]])

In [77]:
cond = names == 'Bob'
#assign the condition as name == 'Bob'

data[~cond]

array([[-0.00244133, -0.22006287,  1.03136571,  1.94008976],
       [-1.44987644,  0.21653669, -0.60262569,  1.65677554],
       [-0.86597802,  0.05631247,  0.61734096,  0.62169134],
       [-0.88758814, -1.08349708, -0.1869806 , -0.29709922],
       [-0.46146129,  0.263424  , -0.09751778, -0.93508853]])

In [79]:
#use 2 of the 3 names to combine multiple boolean conditions 
#and use operators like &(and) and |(or)

mask = (names =='Bob') | (names =='Will')

mask

array([ True, False,  True,  True,  True, False, False])

In [80]:
data[mask]

array([[-0.90348404, -1.80356343, -2.02068893, -0.31465935],
       [-1.44987644,  0.21653669, -0.60262569,  1.65677554],
       [-0.11687119, -0.56795246,  0.50862295, -1.26694324],
       [-0.86597802,  0.05631247,  0.61734096,  0.62169134]])

In [81]:
#set all negative values in data to 0
data[ data < 0] = 0

data

array([[0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.03136571, 1.94008976],
       [0.        , 0.21653669, 0.        , 1.65677554],
       [0.        , 0.        , 0.50862295, 0.        ],
       [0.        , 0.05631247, 0.61734096, 0.62169134],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.263424  , 0.        , 0.        ]])

In [82]:
data[names != 'Joe'] = 7

data

#these types of operations on a 2-d data are applicable to pandas

array([[7.        , 7.        , 7.        , 7.        ],
       [0.        , 0.        , 1.03136571, 1.94008976],
       [7.        , 7.        , 7.        , 7.        ],
       [7.        , 7.        , 7.        , 7.        ],
       [7.        , 7.        , 7.        , 7.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.263424  , 0.        , 0.        ]])

In [2]:
#Fancy Indexing
#NumPy's name for indexing with integer arrays
#set up an 8 x 4 example array to index into

import numpy as np
arr = np.empty((8, 4))

#fill every row with its own row number, counting from 0
for row_idx, row in enumerate(arr):
    row[:] = row_idx

arr

array([[0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [2., 2., 2., 2.],
       [3., 3., 3., 3.],
       [4., 4., 4., 4.],
       [5., 5., 5., 5.],
       [6., 6., 6., 6.],
       [7., 7., 7., 7.]])

In [3]:
#To select a subset of rows in a particular order, 
#pass a list or ndarray of integers specifying the desired order

arr[[4, 3, 0, 6]]

#to choose a subset of rows in a particular order,
#pass a list specifying desired order

array([[4., 4., 4., 4.],
       [3., 3., 3., 3.],
       [0., 0., 0., 0.],
       [6., 6., 6., 6.]])

In [4]:
#this is not row selection: four separate integers are treated as
#one index per dimension, but arr is only 2-d, so NumPy raises IndexError
#catch it so the demonstration does not stop a Restart & Run All
try:
    arr[4, 3, 0, 6]
except IndexError as err:
    print("IndexError:", err)

IndexError: too many indices for array

In [5]:
arr[4, 3]

#this is selecting the position

4.0

In [6]:
arr[[4,3]]

#but if select row, need double [[]]

array([[4., 4., 4., 4.],
       [3., 3., 3., 3.]])

In [7]:
#using negative indices select rows from the end

arr[[ -3, -5, -7]]

array([[5., 5., 5., 5.],
       [3., 3., 3., 3.],
       [1., 1., 1., 1.]])

In [8]:
#Passing multiple index array does something different
#1- dimensional array of elements corresponding to tuple of indices

arr = np.arange(32).reshape((8,4))
#a range from 0 to 31, there are 32 values
#shape in 8 rows and 4 columns

arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [9]:
#choose the rows from the arr above

arr[[1,5,7,2], [0,3,1,2]]

#choose position 1,0 & 5,3 &7,1 & 2,2
#this is a 1-dimensional output

array([ 4, 23, 29, 10])

In [10]:
#fancy indexing and select rows

arr[[1,5,7,2]][:, [0,3,1,2]]

#the output will be rows

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

In [11]:
#Transposing Arrays and Swapping Axes

#transposing returns a view on the underlying data without copying anything
#Arrays have the transpose() method and the special .T attribute (no parentheses)

arr = np.arange(15).reshape((3,5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [12]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [13]:
#np.dot computes the inner matrix product

#product dependant on the matrix size
np.dot(arr.T, arr)

array([[125, 140, 155, 170, 185],
       [140, 158, 176, 194, 212],
       [155, 176, 197, 218, 239],
       [170, 194, 218, 242, 266],
       [185, 212, 239, 266, 293]])

In [14]:
np.dot(arr, arr.T)

#product dependant on the matrix size

array([[ 30,  80, 130],
       [ 80, 255, 430],
       [130, 430, 730]])

In [16]:
arr = np.random.randn(6,3)

arr

array([[ 1.29888178,  0.63954324,  1.95862981],
       [ 2.29658378, -0.62954626,  0.66451046],
       [ 0.90706131,  0.58967567, -0.5717512 ],
       [-0.33555653, -0.97481843,  0.52880092],
       [ 1.45568239,  1.20222261, -0.36093178],
       [-0.77886582,  0.35462422, -0.39921811]])

In [17]:
np.dot(arr.T, arr)

array([[10.62239253,  1.72071362,  3.15961183],
       [ 1.72071362,  3.67442997, -0.59383703],
       [ 3.15961183, -0.59383703,  5.17398156]])

In [18]:
#transpose() will accept a tuple of axis numbers
#to permute the axes (for extra mind-bending)

arr = np.arange(16).reshape((2,2,4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [22]:
arr.transpose((1,0,2))

#(1,0,2) lists the new order of the original axes

#axes have been reordered with 2nd axis first, 
#first axis second and last axis unchanged

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

In [30]:
#simple transposing with .T is a special case of swapping axes
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [33]:
#ndarray has the method swapaxes, which takes a pair of axis numbers
#and switches the indicated axes
arr.swapaxes(1,2)



array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])