In [2]:
from IPython.display import Image

In [3]:
import numpy as np

-------------------------------
#### Masking and Filtering
--------------------

`Index-based` selection is great, but what if we want to filter data based on more complicated `nonuniform` or `nonsequential` criteria? 

This is where the concept of a `mask` comes into play.

A `mask` is an array that has the `exact same shape as your data`, but instead of your values, it `holds Boolean values: either True or False`. 

You can use this mask array to index into your data array in `nonlinear and complex` ways. 

> It will `return` all of the elements where the Boolean array has a `True` value.

In [4]:
# Ten consecutive integers (0 through 9) to slice and modify in the cells below.
arr = np.arange(0, 10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [5]:
arr[5:8]

array([5, 6, 7])

In [6]:
# Broadcasting: the scalar on the right is written to every position the
# slice selects (indices 5, 6 and 7), modifying arr in place.
arr[5:8] = 12

In [7]:
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

In [9]:
# Keep a handle on the slice; the following cells write through it to show
# that it shares its data with arr.
arr_slice = arr[5: 8]
arr_slice

array([12, 12, 12])

In [10]:
# change values in arr_slice, the mutations are reflected in the original 
# array arr
arr_slice[0] = 12345

In [11]:
arr_slice

array([12345,    12,    12])

In [12]:
arr

array([    0,     1,     2,     3,     4, 12345,    12,    12,     8,
           9])

In [13]:
# Assigning through a sub-slice of arr_slice (here just position 1) also
# writes into the original array arr, because arr_slice shares arr's data.
arr_slice[1:2] = 9999

In [14]:
arr_slice

array([12345,  9999,    12])

In [15]:
arr

array([    0,     1,     2,     3,     4, 12345,  9999,    12,     8,
           9])

If you assign a `slice of an array` to a new variable, the new array is a `view`: 
it `refers` to the same memory as the parent array instead of holding its own copy of the data.

That means any change you make to the new array is reflected in the parent 
array as well.

So, to avoid disturbing the parent array, make a copy of the slice using `copy()`. 
All numpy arrays come with the `copy()` method.

##### deep copy method

In [16]:
# .copy() gives arr_slice its own data, so later writes to it leave arr untouched.
arr_slice = arr[5:8].copy()

In [17]:
arr_slice

array([12345,  9999,    12])

In [18]:
# A bare `:` selects every element, so this overwrites the whole copy.
arr_slice[:] = 888

In [19]:
arr_slice

array([888, 888, 888])

In [20]:
# the original array is not changed
arr

array([    0,     1,     2,     3,     4, 12345,  9999,    12,     8,
           9])

##### Boolean Indexing

In [21]:
# One name per row of `data`: names[i] labels data[i].
names = np.array(['Rajat', 'Maruthi', 'Dinesh', 'Rajat', 'Bhanu', 'Viswa', 'Kamal'])

# Seed NumPy's global random generator so that "Restart & Run All" produces
# the same values every time; without it every run yields different numbers.
np.random.seed(42)
data = np.random.randn(7, 4)

In [22]:
data

array([[-0.36416788, -1.17365193,  0.85303706,  0.74417769],
       [ 0.8651274 ,  0.98672865, -0.22567392, -0.79323477],
       [-1.44939295, -0.35304983,  0.35458012, -0.53499511],
       [ 0.99866008, -1.60963783, -2.09894002,  0.03543018],
       [-0.98431614,  0.01957366, -0.69412815, -0.64175318],
       [-0.71060777, -2.12626259,  0.48596335,  0.85243231],
       [-0.75031346, -0.79924159, -0.00770832,  0.85942873]])

In [24]:
# Suppose each name corresponds to a row in the data array and we wanted to 
# select all the rows with corresponding name 'Rajat'.
names == 'Rajat'

array([ True, False, False,  True, False, False, False])

In [25]:
# Use the boolean mask as a row selector: only the rows where the mask is
# True (rows 0 and 3) are returned.
data[names == 'Rajat']

array([[-0.36416788, -1.17365193,  0.85303706,  0.74417769],
       [ 0.99866008, -1.60963783, -2.09894002,  0.03543018]])

> The boolean array must be of the `same length` as the array axis it’s indexing.

In [19]:
# what happens if they are of different lengths
# Here `names` has only 5 entries while `data` still has 7 rows.
# NOTE: this rebinds both `names` and `data`, replacing the arrays created earlier.
names = np.array(['Rajat', 'Maruthi', 'Rajat', 'Bhanu', 'Viswa' ])
data = np.random.randn(7, 4)

In [20]:
# The mask built from `names` is shorter than the number of rows in `data`,
# so NumPy rejects the lookup with an IndexError. The error is the point of
# this cell; catch and print it so "Run All" can continue past the demo.
try:
    data[names == 'Rajat']
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: boolean index did not match indexed array along dimension 0; dimension is 7 but corresponding boolean dimension is 5

In [21]:
# Restore the full 7-entry `names` array (the previous demonstration rebound
# it to 5 entries) so the mask length matches the number of rows again.
names = np.array(['Rajat', 'Maruthi', 'Dinesh', 'Rajat', 'Bhanu', 'Viswa', 'Kamal'])
# NOTE(review): this draws a fresh random `data`, so its values differ from
# the array displayed earlier; some outputs shown further down appear to come
# from that earlier array -- re-run the notebook top to bottom to refresh them.
data = np.random.randn(7, 4)
data

array([[-0.78117932, -1.87714817, -0.11804054,  0.71183517],
       [ 2.0135055 , -0.63173415,  0.24951605, -1.04795368],
       [-1.03342082, -1.09557775, -3.24943899,  0.27256337],
       [-0.44104784,  0.91402486, -1.57132968, -1.39076858],
       [ 0.73530958,  0.48299097, -0.07310301, -1.36678906],
       [ 0.36372633,  0.03196181, -1.36970052, -0.30487314],
       [-0.76656998,  0.36283378,  0.11567745, -0.84707846]])

In [26]:
data[names == 'Kamal']

array([[-0.75031346, -0.79924159, -0.00770832,  0.85942873]])

In [27]:
# Boolean mask on the rows combined with a slice on the columns:
# the first two columns of the matching row.
data[names == 'Kamal', :2]

array([[-0.75031346, -0.79924159]])

In [28]:
# Boolean mask on the rows plus an integer column index:
# column 3 of the matching row, returned as a 1-D array.
data[names == 'Kamal', 3]

array([0.85942873])

To select everything but 'Kamal', you can either use != or negate the condition using ~

In [29]:
names != 'Kamal'

array([ True,  True,  True,  True,  True,  True, False])

In [30]:
data[names != 'Kamal']

array([[-0.36416788, -1.17365193,  0.85303706,  0.74417769],
       [ 0.8651274 ,  0.98672865, -0.22567392, -0.79323477],
       [-1.44939295, -0.35304983,  0.35458012, -0.53499511],
       [ 0.99866008, -1.60963783, -2.09894002,  0.03543018],
       [-0.98431614,  0.01957366, -0.69412815, -0.64175318],
       [-0.71060777, -2.12626259,  0.48596335,  0.85243231]])

In [31]:
# ~ inverts the boolean mask, so this selects the same rows as data[names != 'Kamal'].
data[~(names == 'Kamal')]

array([[-0.36416788, -1.17365193,  0.85303706,  0.74417769],
       [ 0.8651274 ,  0.98672865, -0.22567392, -0.79323477],
       [-1.44939295, -0.35304983,  0.35458012, -0.53499511],
       [ 0.99866008, -1.60963783, -2.09894002,  0.03543018],
       [-0.98431614,  0.01957366, -0.69412815, -0.64175318],
       [-0.71060777, -2.12626259,  0.48596335,  0.85243231]])

In [32]:
# Store the mask in a variable so it can be reused and negated in the next cells.
cond = names == 'Kamal'

In [33]:
data[cond]

array([[-0.75031346, -0.79924159, -0.00770832,  0.85942873]])

In [34]:
data[~cond]

array([[-0.36416788, -1.17365193,  0.85303706,  0.74417769],
       [ 0.8651274 ,  0.98672865, -0.22567392, -0.79323477],
       [-1.44939295, -0.35304983,  0.35458012, -0.53499511],
       [ 0.99866008, -1.60963783, -2.09894002,  0.03543018],
       [-0.98431614,  0.01957366, -0.69412815, -0.64175318],
       [-0.71060777, -2.12626259,  0.48596335,  0.85243231]])

To select the rows that match either of two names, combine multiple boolean conditions 
using boolean arithmetic operators like & (and) and | (or):

In [35]:
# | is the element-wise "or": the mask is True where the name is 'Rajat' or 'Bhanu'.
# Each comparison needs its own parentheses because | binds more tightly than ==.
cond = (names == 'Rajat') | (names == 'Bhanu')

In [36]:
data[cond]

array([[-0.36416788, -1.17365193,  0.85303706,  0.74417769],
       [ 0.99866008, -1.60963783, -2.09894002,  0.03543018],
       [-0.98431614,  0.01957366, -0.69412815, -0.64175318]])

In [37]:
# Comparing the whole array yields an element-wise mask with the same shape as data.
# NOTE: this rebinds `cond`, which previously held a 1-D mask over the rows.
cond = data > 0

In [38]:
cond

array([[False, False, False,  True],
       [ True, False,  True, False],
       [False, False, False,  True],
       [False,  True, False, False],
       [ True,  True, False, False],
       [ True,  True, False, False],
       [False,  True,  True, False]])

In [39]:
# Indexing with a 2-D mask returns the selected elements as a flat 1-D array.
data[cond]

array([0.71183517, 2.0135055 , 0.24951605, 0.27256337, 0.91402486,
       0.73530958, 0.48299097, 0.36372633, 0.03196181, 0.36283378,
       0.11567745])