## Lesson 13: Variability

In [1]:
import numpy as np

In [2]:
# Keep the observations in a NumPy array so arithmetic can be applied
# to every element at once.
data = np.array([
    33219, 36254, 38801, 46335, 46840,
    47596, 55130, 56863, 78070, 88830,
])

# arithmetic mean of all observations
mean = data.mean()
print('The mean is {:g}'.format(mean))

The mean is 52793.8


In [3]:
# Signed distance of every observation from the mean
# (negative values lie below the mean, positive values above it).
devs = np.subtract(data, mean)
print(devs)

[-19574.8 -16539.8 -13992.8  -6458.8  -5953.8  -5197.8   2336.2   4069.2
  25276.2  36036.2]


In [4]:
# average deviation: total of the signed deviations over the number of observations
avg_dev = devs.sum() / data.size

# Positive and negative deviations cancel each other out, so what is
# printed is practically zero (only floating-point round-off is left).
print('The average deviation equals {:.15f}'.format(avg_dev))

The average deviation equals -0.000000000002910


In [5]:
# Drop the signs so that the deviations can no longer cancel each other out.
abs_devs = np.abs(devs)
print(abs_devs)

[19574.8 16539.8 13992.8  6458.8  5953.8  5197.8  2336.2  4069.2 25276.2
 36036.2]


In [6]:
# average absolute deviation: mean of the unsigned distances from the mean
avg_absdev = abs_devs.sum() / data.size
print('The average absolute deviation equals {:.2f}'.format(avg_absdev))

The average absolute deviation equals 13543.56


In [7]:
# Square every deviation -- another way of getting rid of the signs.
sq_devs = np.square(devs)
print(sq_devs)

[3.83172795e+08 2.73564984e+08 1.95798452e+08 4.17160974e+07
 3.54477344e+07 2.70171248e+07 5.45783044e+06 1.65583886e+07
 6.38886286e+08 1.29860771e+09]


In [8]:
# variance: the mean of the squared deviations
variance = sq_devs.sum() / data.size
print('The average squared deviation (variance) equals {:.2f}'.format(variance))

# standard deviation: the square root of the variance
print('The standard deviation equals {:.2f}'.format(np.sqrt(variance)))

The average squared deviation (variance) equals 291622740.36
The standard deviation equals 17076.97


### What if we write a function? 🤔

In [9]:
def calculate_std(array, ddof=0):
    """Return the standard deviation of the values in ``array``.

    The computation is spelled out step by step: mean -> deviations from
    the mean -> squared deviations -> variance -> square root.

    Parameters
    ----------
    array : array_like
        The data, as a NumPy array or any sequence NumPy can convert
        (list, tuple, ...). Multi-dimensional input is treated as one flat
        collection of values.
    ddof : int, optional
        "Delta degrees of freedom": the sum of squared deviations is divided
        by ``n - ddof``, where ``n`` is the number of values. The default
        (0) gives the population standard deviation; pass 1 to apply
        Bessel's correction and estimate the population standard deviation
        from a sample.

    Returns
    -------
    numpy.float64
        The standard deviation, or NaN when ``n - ddof`` is not positive
        (for example, for empty input).
    """
    values = np.asarray(array)

    # Divide by the total number of elements (not len(), which only counts
    # rows for multi-dimensional input), reduced by ddof.
    divisor = values.size - ddof
    if divisor <= 0:
        # Nothing to average over: return NaN directly instead of running
        # into divide-by-zero warnings further down.
        return np.float64(np.nan)

    mean = np.mean(values)
    devs = values - mean
    sq_devs = devs**2
    variance = np.sum(sq_devs) / divisor
    return np.sqrt(variance)

In [10]:
# the dataset from Quiz: SD Social Networkers
sna_data = np.array([
    38946, 43420, 49191, 50430, 50557,
    52580, 53595, 54135, 60181, 62076,
])

# run the step-by-step helper on the quiz data
result = calculate_std(array=sna_data)
print('The standard deviation equals {:.2f}'.format(result))

The standard deviation equals 6557.16


### How did we do?

We can check whether our calculation is correct by comparing it with NumPy's built-in `np.std` function, which computes the population standard deviation by default:

In [11]:
# cross-check: NumPy's own standard deviation of the same data
print('The standard deviation equals {:.2f}'.format(sna_data.std()))

The standard deviation equals 6557.16


### Quiz: Spreadsheet SD

In [12]:
from numpy import genfromtxt

In [13]:
# Read the comma-separated salary file into a 1-D float array.
# NOTE: the first parsed entry is NaN (see the output below) -- presumably
# a header line that cannot be read as a number.
salaries = genfromtxt(fname='datasets/SNA_salaries.csv', delimiter=',')
salaries

array([     nan, 59147.29, 61379.14, 55683.19, 56272.76, 52055.88,
       47696.74, 60577.53, 49793.44, 35562.29, 58586.76, 47091.37,
       36906.96, 53479.66, 67834.74, 53018.8 , 60375.11, 36566.91,
       52905.58, 51063.31, 65431.26, 57071.83, 30060.59, 42619.62,
       52984.77, 57871.28, 41274.37, 24497.78, 47939.82, 42755.52,
       57189.35, 37216.45, 44742.99, 47119.04, 59269.48, 53336.8 ,
       39719.54, 69473.2 , 39831.55, 58300.7 , 41726.66, 40283.35,
       59652.4 , 40326.61, 28167.31, 51420.36, 55294.22, 48116.14,
       36780.47, 53628.89, 48782.09, 33615.77, 41881.34, 64745.33,
       53482.58, 48838.54, 57031.73, 62821.03, 60627.78, 46568.52,
       38977.05, 43250.62, 67502.5 , 54696.18, 43003.14, 29156.83,
       61230.07, 56749.93, 48373.77, 52428.26, 29961.91, 54524.28,
       83017.28, 49290.55, 56375.66, 64032.27, 52947.6 , 61210.22,
       54438.94, 48825.68, 54118.71, 45305.73, 42361.59, 52852.52,
       62933.52, 64330.23, 48922.74, 27211.96, 62409.65, 28981

In [14]:
# Keep only the entries that are not NaN.
salaries = salaries[np.logical_not(np.isnan(salaries))]
salaries

array([59147.29, 61379.14, 55683.19, 56272.76, 52055.88, 47696.74,
       60577.53, 49793.44, 35562.29, 58586.76, 47091.37, 36906.96,
       53479.66, 67834.74, 53018.8 , 60375.11, 36566.91, 52905.58,
       51063.31, 65431.26, 57071.83, 30060.59, 42619.62, 52984.77,
       57871.28, 41274.37, 24497.78, 47939.82, 42755.52, 57189.35,
       37216.45, 44742.99, 47119.04, 59269.48, 53336.8 , 39719.54,
       69473.2 , 39831.55, 58300.7 , 41726.66, 40283.35, 59652.4 ,
       40326.61, 28167.31, 51420.36, 55294.22, 48116.14, 36780.47,
       53628.89, 48782.09, 33615.77, 41881.34, 64745.33, 53482.58,
       48838.54, 57031.73, 62821.03, 60627.78, 46568.52, 38977.05,
       43250.62, 67502.5 , 54696.18, 43003.14, 29156.83, 61230.07,
       56749.93, 48373.77, 52428.26, 29961.91, 54524.28, 83017.28,
       49290.55, 56375.66, 64032.27, 52947.6 , 61210.22, 54438.94,
       48825.68, 54118.71, 45305.73, 42361.59, 52852.52, 62933.52,
       64330.23, 48922.74, 27211.96, 62409.65, 28981.92, 64913

In [15]:
# standard deviation of the NaN-free salary data, computed with our own helper
SNA_std = calculate_std(array=salaries)
print('The standard deviation equals {:.2f}'.format(SNA_std))

The standard deviation equals 10656.95


### Bessel's Correction

In [16]:
sample = np.array([18,21,15,18,17,21,22,23,20])

# calculate standard deviation for the sample data; modify the calculate_std() function
# if you want to get an estimate for the population standard deviation