In [1]:
# Importing all the important libraries
import numpy as np
import matplotlib.pyplot as plt
from statistics import *
from IPython import display
%matplotlib inline

# Measure Of Spread 
A measure of spread, sometimes also called a measure of dispersion, is used to describe the variability in a sample or population. It is usually used in conjunction with a measure of central tendency, such as the mean or median, to provide an overall description of a set of data.

### Four Concepts in Measures Of Spread
#### 1.  Range
> The Range tells you how much is in between the lowest value (start value) and highest value (end value). 
#### 2. Quartiles
> Quartiles are values that split your data into quarters. The lowest quartile is called the lower quartile, and the highest     quartile is called the upper quartile. The middle quartile is the median
#### 3. Variance
> The variance is a way of measuring spread, and it’s the average of the distance of values from the mean squared.
#### 4. Standard Deviation
> the standard deviation is a measure of how spread out data is around center of the distribution (the mean). It also gives you an idea of where, percentage wise, a certain value falls.

#### Calculation of Range
From the definition itself it is clear that range is difference of the highest and smallest values present in a dataset.
So, 
>            Range = MaxValue(dataset) - MinValue(dataset)

where MaxValue() - is a function which returns maximum value of the dataset 
& MinValue() - is a function which returns minimum value of the dataset.

### Finding Range

In [2]:
#Example of finding range of a dataset of 45 observations
data_points = np.random.randn(45)
set_range = np.max(data_points) - np.min(data_points)     #MaxValue(data_points) - MinValue(data_points)
print("Data Set = ", data_points)
print("Range of Data Set = ", set_range)

Data Set =  [-0.42617473  0.27485236 -0.84926293  0.58768691 -1.38262185 -0.86249528
 -0.36253823  0.17428053 -1.34592234  0.3093388   1.61480576 -1.16187222
 -1.2132452  -0.35806265 -0.1183731  -1.05098409 -0.44999994 -1.28784661
  1.84024941  0.66125158  1.12438327 -0.34569865 -1.25854231 -1.51192194
 -0.11763739  0.12226942  0.05013383  1.5841762   0.052215   -0.5991119
  0.95427726 -0.45279653 -1.16757157  1.16555828 -2.56053559 -0.98436771
  1.82028622  0.01847426  0.65002655  0.04017848  0.59193735  0.87426157
  0.2330394  -0.30927281  0.35136533]
Range of Data Set =  4.4007850010737055


### Finding Quartiles
An image has been added below for better understanding..

<img src="https://raw.githubusercontent.com/dev-sandarbh/assets/master/quartile_iqr.png" />
<a href="https://en.wikipedia.org/wiki/Quantile" traget="_blank" align="center">Image Credit : Wikipedia</a>

In [3]:
# Considering the last dataset - 'data_points'
''' 
For finding quartiles we have a module named percentiles in numpy. 
For first quartile we use 25%, for second we have 50% and for third quartile we use 75%.
'''
_Q1 = np.percentile(data_points, 25)   #first argument - dataset; second argument is quartile value we wish to find
_Q2 = np.percentile(data_points, 50)
_Q3 = np.percentile(data_points, 75)

print("First Quartile Value = ", _Q1)
print("Second Quartile Value = ", _Q2)
print("Third Quartile Value = ", _Q3)

First Quartile Value =  -0.8624952764885252
Second Quartile Value =  -0.11763738734193276
Third Quartile Value =  0.587686906083165


### Finding the Inter - Quartile Range(IQR)
IQR is nothing but the difference between the third quartile value and the first quartile Value..
The area shown in the blue color in the Quartile image depicts the IQR itself.
> So, IQR = (Q3-Q1) 

In [4]:
print("IQR for our dataset = ",(_Q3-_Q1))

IQR for our dataset =  1.45018218257169


### Finding Variance
Variance tells us that how far our data values lies from the mean/average value.
It is calculated for two different scenarios: For Sample and for whole population.
In Whole Polulation we consider all of the observations available in our data set whereas in sample scenario, we subtract 1 from the number of observations present in the sample set.

#### Formula for Population Scenario
<img src="https://raw.githubusercontent.com/dev-sandarbh/assets/579f3f7b996f16531b9a775826c8d54b724afdc1/variance_population.svg" />

#### Formula for Sample Scenarion
<img src="https://raw.githubusercontent.com/dev-sandarbh/assets/579f3f7b996f16531b9a775826c8d54b724afdc1/variance_sample.svg" />

In [5]:
#let us make some population out of random data points
population_set = np.random.randn(200)   #generating 200 values
population_set

array([-1.81823981, -1.15691915, -0.6403733 ,  1.19522986, -1.74366978,
       -0.75409388,  0.01174015,  1.1894297 ,  1.21489855,  0.8289737 ,
        0.7052364 ,  0.60813143, -1.87333103,  0.56600294,  2.12579344,
        0.93496638, -0.15951587, -1.01055288, -1.01282917, -0.03340658,
       -0.90182846,  0.72281713, -0.84830164, -0.63732206, -0.10009057,
       -1.27155165,  0.68766254,  0.21240806, -0.26854321,  2.09798384,
       -0.38350306, -0.7800808 , -2.37483567,  1.18868295, -0.32145799,
        0.19533823, -0.72624496,  0.15276737,  0.27280507,  1.26025953,
       -0.94705355,  1.23670088,  1.78584354,  0.73950766,  0.08805194,
       -0.71343737, -0.89773609, -1.22668962, -1.05708744, -2.13367059,
       -0.32818812, -1.38753736,  0.03150988,  0.85476584, -1.81353495,
       -0.41726506,  0.07652996,  0.92978952,  0.52793681, -0.39281333,
       -0.95752662, -0.2975471 , -1.21240992,  0.33452537,  0.44328322,
        0.76578536,  1.84027343,  0.46780634,  0.60957696, -0.93

In [6]:
# Now we will make a random selection of 90 values for our sample
sample_set = np.random.choice(population_set, 90)
sample_set

array([ 0.76323513, -0.285998  , -0.13225387,  0.60493962, -0.27985319,
       -2.06422384, -0.18633366,  1.20049047, -1.81823981, -0.73828941,
        1.08742055, -0.48166575,  0.43828917, -0.10009057,  1.44761684,
       -0.63732206,  0.03150988, -0.2237413 , -0.18101943, -0.93300223,
        1.23012655, -0.0681112 ,  0.60493962,  0.76323513,  2.75111935,
        0.92978952,  1.05733256,  0.36298082,  1.19522986,  0.29406309,
       -0.38350306,  0.68766254, -0.15951587, -0.92756095, -0.57975139,
        1.71954519,  0.60957696,  0.28466333,  0.76323513,  0.73950766,
       -1.01282917,  1.23670088, -1.63534691,  0.60493962,  0.01194166,
       -1.82150566, -0.93300223, -2.02336214, -0.72624496, -0.10009057,
       -2.13259483,  0.07767749,  0.93532196,  1.78584354, -0.87140663,
        1.23449602,  0.60813143,  0.8289737 , -0.46172257, -1.2040493 ,
        0.80532862, -0.4243984 , -0.31099831, -1.01282917,  0.27237925,
       -1.01282917, -0.75409388, -0.38350306, -1.81353495, -1.82

In [7]:
# To calculate variance of Population data set, we write like this
print("Population Variance = ", np.var(population_set))

Population Variance =  1.0916211610330695


In [8]:
# To calculate variance of Sample data set, we write like this
print("Sample Variance = ", np.var(sample_set))

Sample Variance =  1.0591793151496864


In [9]:
# Let us find the difference between the population variance and sample variance
print("Difference Population Variance and Sample Variance = ", (np.var(population_set) - np.var(sample_set)))

Difference Population Variance and Sample Variance =  0.03244184588338306


### Finding Standard Deviation
An image has been added for better understanding of the same.
<img src="https://raw.githubusercontent.com/dev-sandarbh/assets/075f813e2e14bd708c84e22460a0013ed99e32c5/stdev.svg" />
<a href="https://en.wikipedia.org/wiki/Standard_deviation" target="_blank">Image credit: Wikipedia</a>

#### Formula of Standard Deviation
Standard Deviation is square root of the variance...
<img src="https://raw.githubusercontent.com/dev-sandarbh/assets/5edc03049182015195fd24d227c8d02e1bc6b832/stdev_root_of_variance.svg"/>

Standard Deviation for the Population
<img src="https://raw.githubusercontent.com/dev-sandarbh/assets/5edc03049182015195fd24d227c8d02e1bc6b832/population_stdev.svg"/>

Standard Deviation for the Sample
<img src="https://raw.githubusercontent.com/dev-sandarbh/assets/5edc03049182015195fd24d227c8d02e1bc6b832/sample_stdev.svg"/>

In [10]:
# Finding standard deviation for population
print("Standard Deviation for Population dataset = ", np.std(population_set))

# Finding standard Deviation for sample
print("Standard Deviation for Sample dataset = ", np.std(sample_set))

Standard Deviation for Population dataset =  1.044806757746651
Standard Deviation for Sample dataset =  1.029164377128205
