In [22]:
from IPython.display import Image

--------------
## Statistical functions
-----------------------

NumPy has quite a few useful statistical functions for finding the minimum, maximum, percentiles, standard deviation, variance, etc. of the elements in an array.

#### Averages and variances

In [23]:
import numpy as np 

`numpy.median`

- Compute the median along the specified axis.

- Returns the median of the array elements.

In [2]:
# 3x3 integer matrix used by the median examples that follow.
a = np.array([
    [10, 7, 4],
    [3, 2, 1],
    [1, 2, 3],
])
a

array([[10,  7,  4],
       [ 3,  2,  1],
       [ 1,  2,  3]])

In [6]:
np.median(a)

3.0

In [7]:
np.median(a, axis=0, keepdims=True)

array([[3., 2., 3.]])

In [5]:
np.median(a, axis=1, keepdims=True)

array([[7.],
       [2.],
       [2.]])

`numpy.average`

- Compute the weighted average along the specified axis.

In [8]:
# Plain Python list 1..3 used as input for np.average.
data = [*range(1, 4)]
data

[1, 2, 3]

In [9]:
np.average(data)

2.0

In [10]:
np.average(data, weights=[1,2,3])

2.3333333333333335

In [11]:
# Two-element list so each value can be paired with one weight.
data = [*range(1, 3)]
data

[1, 2]

In [12]:
np.average(data, weights=[1, 3])

1.75

In [13]:
np.average(data, weights=[1./4, 3./4])

1.75

In [14]:
np.average(data, weights=[.5,.5])

1.5

In [15]:
# 0..5 laid out as 3 rows x 2 columns.
data = np.arange(6).reshape(3, 2)
data

array([[0, 1],
       [2, 3],
       [4, 5]])

In [16]:
np.average(data, axis=1, weights=[1./4, 3./4])

array([0.75, 2.75, 4.75])

In [17]:
# Equal weights: the weighted average reduces to the plain mean.
incorrect1 = np.array([10, 20])
sample_weight1 = [1, 1]

np.average(incorrect1, axis=0, weights=sample_weight1)

15.0

In [18]:
# Scaling every weight by the same factor leaves the result unchanged:
# only the relative weights matter.
incorrect1 = np.array([10, 20])
sample_weight1 = [.1, .1]

np.average(incorrect1, axis=0, weights=sample_weight1)

15.0

In [19]:
# Weights do not have to sum to 1; the result is sum(w * x) / sum(w).
incorrect1 = np.array([40, 20])
sample_weight1 = [.4, .8]

np.average(incorrect1, axis=0, weights=sample_weight1)

26.666666666666664

In [20]:
# Mixing a bool with an int yields an integer array, so True is counted as 1.
incorrect1 = np.array([True, 20])
sample_weight1 = [1, 2]

np.average(incorrect1, axis=0, weights=sample_weight1)

13.666666666666666

In [21]:
# Boolean input: True/False are averaged as 1/0.
incorrect1 = np.array([True, False])
sample_weight1 = [.4, .8]

np.average(incorrect1, axis=0, weights=sample_weight1)

0.3333333333333333

In [22]:
# All-False input: every value counts as 0, so the weighted average is 0.
incorrect1 = np.array([False, False])
sample_weight1 = [.4, .8]

np.average(incorrect1, axis=0, weights=sample_weight1)

0.0

`numpy.mean`

- Compute the arithmetic mean along the specified axis.

- Returns the average of the array elements. The average is taken over the flattened array by default, otherwise over the specified axis. 

- float64 intermediate and return values are used for integer inputs.

In [23]:
# 2x2 integer matrix used by the np.mean examples.
a = np.array([
    [1, 2],
    [3, 4],
])
a

array([[1, 2],
       [3, 4]])

In [24]:
np.mean(a)

2.5

In [25]:
np.mean(a, axis=0, keepdims=True)

array([[2., 3.]])

In [26]:
# `a` holds integers, so np.mean already uses float64 intermediates and the
# explicit dtype below does not change the result (2.5 either way).
# Passing dtype=np.float64 improves accuracy for lower-precision float input
# such as float32 arrays.

np.mean(a, dtype=np.float64)

2.5

`numpy.nanmean`

- Compute the arithmetic mean along the specified axis, ignoring NaNs.
- Returns the average of the array elements. 
- The average is taken over the flattened array by default, otherwise over the specified axis. 

In [6]:
# 2x2 float matrix with one missing value (nan) in the first row.
a = np.array([
    [1, np.nan],
    [3, 4],
])

In [7]:
np.nanmean(a)

2.6666666666666665

In [9]:
np.nanmean(a, axis=0)

array([2., 4.])

In [8]:
np.nanmean(a, axis=1)

array([1. , 3.5])

In [24]:
# NOTE(review): absolute, machine-specific Windows path (the same pattern is used by the
# other Image(...) cells) — presumably not available on other machines; consider a path
# relative to the notebook.
Image(r'D:\MYLEARN\2-ANALYTICS-DataScience\icons-images\stats-24.JPG', width=800)

<IPython.core.display.Image object>

In [27]:
# The same sample twice: once clean, once with a nan inserted after the third item.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [np.nan] + x[3:]

In [28]:
# NumPy array versions of the two Python lists.
y = np.array(x)
y_with_nan = np.array(x_with_nan)

In [29]:
np.ptp(y), np.ptp(y_with_nan)

(27.0, nan)

Alternatively, you can use built-in Python, NumPy, or Pandas functions and methods to calculate the maxima and minima of sequences:

- max() and min() from the Python standard library
- amax() and amin() from NumPy
- nanmax() and nanmin() from NumPy to ignore nan values
- the ndarray methods .max() and .min() from NumPy
- .max() and .min() from Pandas to ignore nan values by default

In [30]:
np.amax(y) - np.amin(y)

27.0

In [31]:
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

27.0

In [32]:
y.max() - y.min()

27.0

##### Variance
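The sample variance quantifies the spread of the data. It shows numerically how far the data points are from the mean. Note that the cells below divide by $n$ (the population variance, which is also NumPy's default, `ddof=0`); the sample variance divides by $n - 1$ and is obtained with `np.var(x, ddof=1)`.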

In [28]:
Image(r'D:\MYLEARN\2-ANALYTICS-DataScience\icons-images\stats-25.JPG', width=800)

<IPython.core.display.Image object>

In [30]:
Image(r'D:\MYLEARN\2-ANALYTICS-DataScience\icons-images\stats-26.JPG', width=800)

<IPython.core.display.Image object>

In [181]:
Image(r'D:\MYLEARN\2-ANALYTICS-DataScience\icons-images\stats-46.JPG', width=800)

<IPython.core.display.Image object>

There are two datasets in this figure:

- Green dots: This dataset has a smaller variance or a smaller average difference from the mean. It also has a smaller range or a smaller difference between the largest and smallest item.
- White dots: This dataset has a larger variance or a larger average difference from the mean. It also has a bigger range or a bigger difference between the largest and smallest item.

> Note that these two datasets have the same mean and median, even though they appear to differ significantly. Neither the mean nor the median can describe this difference. 

> That’s why you need the measures of variability.

In [33]:
n = len(x)

In [34]:
mean_ = sum(x) / n

In [37]:
# Divides by n, i.e. the population variance; this is what np.var returns with its
# default ddof=0. Dividing by (n - 1) instead would give the sample variance.
var_ = sum((item - mean_)**2 for item in x) / (n )
var_

98.55999999999999

In [36]:
np.var(x)

98.55999999999999

#### Standard Deviation

- The sample `standard deviation` is another measure of data spread. 

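- It’s connected to the sample variance: the standard deviation, 𝑠, is the positive square root of the sample variance. (The `np.std` examples below use the default `ddof=0`, i.e. the population form; pass `ddof=1` for the sample standard deviation.)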

- The standard deviation is often more convenient than the variance because it `has the same unit as the data points`.

In [31]:
Image(r'D:\MYLEARN\2-ANALYTICS-DataScience\icons-images\stats-27.JPG', width=800)

<IPython.core.display.Image object>

`numpy.std`
- Compute the standard deviation along the specified axis.

- Returns the standard deviation, a measure of the spread of a distribution, of the array elements. The standard deviation is computed for the flattened array by default, otherwise over the specified axis.

In [41]:
# The same sample twice: once clean, once with a nan inserted after the third item.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [np.nan] + x[3:]

In [42]:
n = len(x)

In [43]:
mean_ = sum(x) / n

In [44]:
# Divides by n, i.e. the population variance; this is what np.var returns with its
# default ddof=0. Dividing by (n - 1) instead would give the sample variance.
var_ = sum((item - mean_)**2 for item in x) / (n )
var_

98.55999999999999

In [45]:
np.var(x)

98.55999999999999

In [46]:
std_ = var_ ** 0.5
std_

9.927738916792684

In [47]:
np.std(x)

9.927738916792684

One more example, this time with a 2-D array:

In [11]:
# 2x2 integer matrix used by the np.std examples.
a = np.array([
    [1, 2],
    [3, 4],
])
a

array([[1, 2],
       [3, 4]])

In [12]:
np.std(a)

1.118033988749895

In [13]:
np.std(a, axis=0)

array([1., 1.])

In [14]:
np.std(a, axis=1)

array([0.5, 0.5])

`numpy.nanstd`

- Compute the standard deviation along the specified axis, while ignoring NaNs.

- Returns the standard deviation, a measure of the spread of a distribution, of the non-NaN array elements. The standard deviation is computed for the flattened array by default, otherwise over the specified axis.

In [15]:
# One entry is nan; nanstd computes the spread of the remaining values.
a = np.array([
    [1, np.nan],
    [3, 4],
])
np.nanstd(a)

1.247219128924647

In [16]:
np.nanstd(a, axis=0)

array([1., 0.])

In [17]:
np.nanstd(a, axis=1)

array([0. , 0.5])

`numpy.std`
- Compute the standard deviation along the specified axis.

- Returns the standard deviation, a measure of the spread of a distribution, of the array elements. The standard deviation is computed for the flattened array by default, otherwise over the specified axis.



In [18]:
# Standard deviation over all four elements (flattened array).
a = np.array([
    [1, 2],
    [3, 4],
])
np.std(a)

1.118033988749895

In [19]:
np.std(a, axis=0)

array([1., 1.])

In [20]:
np.std(a, axis=1)

array([0.5, 0.5])

`numpy.nanstd`
- Compute the standard deviation along the specified axis, while ignoring NaNs.

- Returns the standard deviation, a measure of the spread of a distribution, of the non-NaN array elements. The standard deviation is computed for the flattened array by default, otherwise over the specified axis.

In [21]:
# One entry is nan; nanstd computes the spread of the remaining values.
a = np.array([
    [1, np.nan],
    [3, 4],
])
np.nanstd(a)

1.247219128924647