In [None]:
import numpy as np
import pandas as pd

### Load the dataset and extract the precipitation column as a NumPy array

In [None]:
# Load the CSV file at data/Seattle2014.csv into a DataFrame.
rainfall = pd.read_csv('data/Seattle2014.csv')
# Preview the first two rows to check that the file parsed as expected.
rainfall.head(2)

In [None]:
# Build the same quantity twice: once as a pandas Series, once as a NumPy array.
# NOTE(review): dividing by 254 presumably converts PRCP from tenths of a
# millimetre to inches (1 inch = 25.4 mm) -- confirm against the dataset docs.
inches_pd = (rainfall['PRCP'] / 254.).copy()   # pandas Series (explicit copy)
inches    = rainfall['PRCP'].values / 254.     # plain ndarray taken via .values
# Compare the container types and the shapes of the two results.
print(type(inches),type(inches_pd),inches.shape,inches_pd.shape)

# Views, copies, and how to modify them with awareness
https://www.practicaldatascience.org/html/views_and_copies_in_pandas.html
(The issue came up while working on SiSensor optimisation.)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
#import seaborn; seaborn.set()  # set plot styles
# Histogram of the precipitation values using 20 bins; the trailing
# semicolon suppresses the textual repr of the call's return value.
plt.hist(inches, 20);

## Intro: ufuncs for comparisons over arrays
- NumPy's universal functions (ufuncs) can be used in place of loops to do fast element-wise arithmetic operations on arrays.
- In the same way, we can use other ufuncs to do element-wise comparisons over arrays, and we can then manipulate the results to answer the questions we have.

In [None]:
# Small 1-D example array used by the comparison demos in the next cells.
x = np.array([1, 2, 3, 4, 5])
x < 3  # element-wise "less than"; yields a Boolean array
# the operator above is shorthand for the ufunc call below
np.less(x,3)

In [None]:
# "<=" has a ufunc counterpart as well: np.less_equal
np.less_equal(x,3)

In [None]:
x > 3  # greater than
# the operator above is shorthand for np.greater; np.greater_equal is the ufunc form of ">="
np.greater(x,3),'  ',np.greater_equal(x,3)

In [None]:
# ufunc forms of "!=" and "==", shown side by side as one tuple
np.not_equal(x,3), ' ', np.equal(x,3)

It is also possible to do an element-wise comparison of two arrays, and to include compound expressions

In [None]:
# Three spellings of the same element-wise test "2*x equals x squared".
# Only the last expression of the cell is echoed by the notebook.
(2 * x) == (x ** 2)
# which is the same as calling the comparison ufunc explicitly

np.equal((2 * x),(x ** 2))
# which is the same as also replacing the arithmetic operators with ufuncs

np.equal(  np.multiply(x,2)  ,  np.power(x,2)  )

## As in the case of arithmetic operators, the comparison operators are implemented as ufuncs in NumPy

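| Operator | Equivalent ufunc | Operator | Equivalent ufunc |
|----------|------------------|----------|------------------|
| `==` | `np.equal` | `!=` | `np.not_equal` |
| `<` | `np.less` | `<=` | `np.less_equal` |
| `>` | `np.greater` | `>=` | `np.greater_equal` |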

## Just as in the case of arithmetic ufuncs, these will work on arrays of any size and shape. Here is a two-dimensional example:

In [None]:
# Fixed seed so that the random draws made from `generator` are reproducible.
SEED = 150777

# One-step form: hand the seed straight to the constructor.
generator = np.random.RandomState(seed=SEED)
# Two-step form, equivalent to the line above: construct first, seed afterwards.
generator = np.random.RandomState()
generator.seed(seed=SEED)

In [None]:
# Draw a 3x4 array of random integers in the range 0..9 from the seeded
# generator; this rebinds x, replacing the earlier 1-D example array.
x = generator.randint(10,size=(3,4))
x

In [None]:
# Comparisons broadcast over every entry, so the result has the same shape as x.
x < 5
# which is the same as
np.less(x,5)

# Working with Boolean Arrays

## Counting entries
To count the number of True entries in a Boolean array, np.count_nonzero is useful:

In [None]:
# Boolean mask: True wherever the corresponding entry of x is below 6.
x < 6

In [None]:
# how many values less than 6?
# count_nonzero counts the True entries of the Boolean mask
np.count_nonzero(x < 6)

One can also use np.sum; in this case, False is interpreted as 0, and True is interpreted as 1:

In [None]:
# Same count via np.sum: each True contributes 1 and each False contributes 0.
np.sum( x<6 )

In [None]:
# how many values less than 6 in each row?
# axis=1 sums across the columns, leaving one count per row
np.sum(x < 6, axis=1)

In [None]:
# how many values less than 6 in each column?
# axis=0 sums down the rows, leaving one count per column
np.sum(x<6, axis=0 )

In [None]:
# are there any values greater than 8?
# The comparison is essential here: np.any(x) on its own would only report
# whether any entry of x is nonzero, which does not answer the question.
np.any(x > 8)

In [None]:
# are all values less than 10?
# np.all is True only if every entry of the Boolean mask is True
np.all(x<10)

In [None]:
# are all values less than 9, within each column
# axis=0 reduces down the rows, giving one Boolean per column
np.all(x<9,axis=0)

In [None]:
# are all values less than 9, within each row
# axis=1 reduces across the columns, giving one Boolean per row
np.all(x<9,axis=1)

## Boolean operators
We've already seen how we might count, say, all days with rain less than four inches, or all days with rain greater than two inches. But what if we want to know about all days with rain less than four inches and greater than one inch? This is accomplished through Python's bitwise logic operators, &, |, ^, and ~. Like with the standard arithmetic operators, NumPy overloads these as ufuncs which work element-wise on (usually Boolean) arrays.

For example, the days with more than 0.5 inches and less than 1 inch of rain can be selected and counted as follows:

In [None]:
# NOTE: len() reports the total number of elements in the mask (one per
# entry of inches), not how many of them are True; np.sum or
# np.count_nonzero on the same mask gives the number of matching days.
len((inches > 0.5) & (inches < 1))

In [None]:
# Wrap the Boolean mask in a Series to get a quick summary via describe().
# NOTE(review): for Boolean data the summary is expected to list
# count/unique/top/freq -- check the cell output.
( pd.Series((inches > 0.5) & (inches < 1)) ).describe()

In [None]:
# Number of entries strictly between 0.5 and 1 inch: "&" combines the two
# masks element-wise, and the parentheses are needed because "&" binds more
# tightly than the comparison operators.
np.sum((inches > 0.5) & (inches < 1))

In [None]:
# The same count written via De Morgan's law: NOT (A OR B) == (NOT A) AND (NOT B).
# The two forms agree as long as inches contains no NaN values.
np.sum(~( (inches <= 0.5) | (inches >= 1) ))

In [None]:
# Tally each category first (every count is the sum of a Boolean mask),
# then report the four numbers.
n_dry = np.sum(inches == 0)
n_wet = np.sum(inches != 0)
n_heavy = np.sum(inches > 0.5)
light_rain = (inches > 0) & (inches < 0.2)   # rainy, but under 0.2 inches
n_light = np.sum(light_rain)

print("Number days without rain:      ", n_dry)
print("Number days with rain:         ", n_wet)
print("Days with more than 0.5 inches:", n_heavy)
print("Rainy days with < 0.2 inches  :", n_light)

## Boolean Arrays as Masks
In the preceding section we looked at aggregates computed directly on Boolean arrays. A more powerful pattern is to use Boolean arrays as masks, to select particular subsets of the data themselves. Returning to our x array from before, suppose we want an array of all values in the array that are less than, say, 5:

In [None]:
# Show the current contents of x again for reference.
x

In [None]:
# Boolean mask with the same shape as x: True where the entry is below 5.
x<5
# which is the same as
np.less(x,5)

Now to select these values from the array, we can simply index on this Boolean array; this is known as a masking operation:

In [None]:
# Masking: indexing with the Boolean array keeps only the entries where the
# mask is True.
x [ x<5 ]

What is returned is a one-dimensional array filled with all the values that meet this condition; in other words, all the values in positions at which the mask array is True.

In [None]:
# construct a mask of all rainy days
rainy_days  = inches>0
rainy_days  # NOTE: not echoed -- a notebook only displays a cell's last expression

# construct a mask of "summer" days from 1-based day-of-year numbers
# NOTE(review): the hard-coded 366 assumes inches holds 365 entries -- confirm
days_of_year   = np.arange(1,366)
summer         = (days_of_year > 172) & (days_of_year < 262)  # days 173..261 inclusive
len(summer)  # length of the whole mask, not the number of summer days

In [None]:
# Summary statistics for different parts of the year.
# `u` is a scratch variable that holds each statistic in turn.

summer_inches = inches[summer]           # precipitation on summer days only
off_season_rain = rainy_days & ~summer   # rainy days outside the summer window

u = np.median(inches[rainy_days])
print("Median precip on rainy days in 2014 (inches):   ", u)

u = np.median(summer_inches)
print("Median precip on summer days in 2014 (inches):  ", u)
plt.hist(summer_inches, bins=100)

u = np.max(summer_inches)
print("Maximum precip on summer days in 2014 (inches): ", u)

u = np.median(inches[off_season_rain])
print("Median precip on non-summer rainy days (inches):", u)

## Aside: Using the Keywords and/or Versus the Operators &/|

One common point of confusion is the difference between the keywords `and` and `or` on one hand, and the operators `&` and `|` on the other hand. When would you use one versus the other?

The difference is this: `and` and `or` gauge the truth or falsehood of the entire object, while `&` and `|` refer to bits within each object.

When you use `and` or `or`, it's equivalent to asking Python to treat the object as a single Boolean entity. In Python, all nonzero integers will evaluate as True:

In [None]:
# A nonzero integer is truthy; zero is falsy.
bool(42), bool(0)

In [None]:
# `and` treats each operand as a whole: 42 is truthy, so the expression
# evaluates to the second operand, 0, and bool(0) is False.
bool(42 and 0)

In [None]:
# Binary representations of the two operands used in the bitwise example.
bin(42),bin(51)

In [None]:
# "&" works bit by bit: 0b101010 & 0b110011 keeps only the bits set in both.
bin(42&51)

In [None]:
# 34 is the integer value of 42 & 51, so this matches the bit pattern above.
bin(34)

In [None]:
# Two Boolean arrays, spelled out with explicit True/False values.
A = np.array([True, False, True, False, True, False])
B = np.array([True, True, True, False, True, True])
# Print both arrays, one per line.
print(A, B, sep="\n")
# Element-wise OR of the two arrays; echoed as the cell's result.
A | B

So remember this: `and` and `or` perform a single Boolean evaluation on an entire object, while `&` and `|` perform multiple Boolean evaluations on the content (the individual bits or bytes) of an object. For Boolean NumPy arrays, the latter is nearly always the desired operation.