In [4]:
import numpy as np
import pandas as pd
# Load the tab-separated housing file into a DataFrame.
houses = pd.read_csv('AmesHousing_1.txt', sep='\t')

In [2]:
# Two toy distributions: A repeats one value, B alternates between 0 and 8.
A = np.asarray([4, 4, 4, 4])
B = np.asarray([0, 8, 0, 8])

In [3]:
# Print the mean of each distribution, A first and then B.
for distribution in (A, B):
    print(distribution.mean())

4.0
4.0


### Measure of Variability (Range)

One way to **measure the variability** of a distribution is to compute its **range**:

* find the difference between the maximum and the minimum value.

<center>
$max(A) - min(A)$

$max(B) - min(B)$

$range(X) = max(X) - min(X)$
</center>

In [31]:
def todict(row):
    """Map one row's 'Yr Sold' value to its 'SalePrice' value.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys 'Yr Sold' and 'SalePrice'.

    Returns
    -------
    dict
        A single-entry dictionary ``{row['Yr Sold']: row['SalePrice']}``.
    """
    return {row['Yr Sold']: row['SalePrice']}

In [32]:
# Price range (max - min) of 'SalePrice' per 'Yr Sold', with every
# resulting row converted into a one-entry {year: range} dictionary.
range_by_year = (
    houses.groupby('Yr Sold')['SalePrice']
    .apply(lambda prices: prices.max() - prices.min())
    .reset_index()
    .apply(todict, axis=1)
)
        

In [33]:
# Convert the per-year results to a plain Python list for display.
range_by_year.tolist()

[{2006: 590000},
 {2007: 715700},
 {2008: 601900},
 {2009: 575100},
 {2010: 598868}]

In [30]:
def find_range(arr):
    """Return the range (maximum minus minimum) of a collection of values.

    Parameters
    ----------
    arr : array-like
        A NumPy array, a pandas Series, or a plain sequence such as a
        list or tuple.

    Returns
    -------
    scalar
        The largest value in ``arr`` minus the smallest one.
    """
    # Plain lists and tuples have no .max()/.min() methods, so wrap them
    # in an array; arrays and Series keep using their own methods.
    if not (hasattr(arr, 'max') and hasattr(arr, 'min')):
        arr = np.asarray(arr)
    return arr.max() - arr.min()

In [35]:
# Price range of 'SalePrice' for each distinct 'Yr Sold', keyed by year.
range_by_year = {
    year: find_range(houses.loc[houses['Yr Sold'] == year, 'SalePrice'])
    for year in houses['Yr Sold'].unique()
}
    

In [36]:
# Display the {year: price range} dictionary.
range_by_year

{2010: 598868, 2009: 575100, 2008: 601900, 2007: 715700, 2006: 590000}

In [38]:
# Print each year next to its price range, one pair per line.
for year, price_range in range_by_year.items():
    print(year, price_range)

2010 598868
2009 575100
2008 601900
2007 715700
2006 590000


### Average Distance


<center>
average distance = $\large \frac{(x_1-\mu)+(x_2-\mu)+...+(x_N-\mu)}{N}$
</center>
<br>
<center>
= $\large \frac{\sum_{i=1}^N(x_i - \mu)}{N}$
</center>

In [43]:
def average_distance(arr):
    """Return the mean of the signed deviations of ``arr`` from its mean.

    Each deviation is ``value - mean`` with its sign kept, so positive
    and negative deviations offset one another when averaged.

    Parameters
    ----------
    arr : iterable of numbers
        The values of the distribution.

    Returns
    -------
    float
        The average of ``value - mean`` over all values.
    """
    center = np.mean(arr)
    return np.mean([value - center for value in arr])

In [44]:
# Nine ones followed by a single large value (21).
C = [1] * 9 + [21]

In [45]:
# Average of the signed distances from the mean for C.
avg_distance = average_distance(C)
avg_distance

0.0

### Mean Absolute Deviation (a.k.a. Average Absolute Deviation)

<center>
mean absolute distance = $\large \frac{|x_1-\mu|+|x_2-\mu|+...+|x_N-\mu|}{N}$
</center>
<br>
<center>
= $\large \frac{\sum_{i=1}^N|x_i - \mu|}{N}$
</center>

In [46]:
def mean_distance(arr):
    """Return the mean absolute deviation of ``arr`` from its mean.

    Parameters
    ----------
    arr : iterable of numbers
        The values of the distribution.

    Returns
    -------
    float
        The average of ``|value - mean|`` over all values.
    """
    center = np.mean(arr)
    absolute_deviations = [abs(value - center) for value in arr]
    return np.mean(absolute_deviations)

In [47]:
# Mean absolute deviation of C.
mad = mean_distance(C)
mad

3.6

### Variance

If we square each distance instead of taking its absolute value, the resulting measure of variability is sometimes called the mean squared distance or mean squared deviation (remember that "distance" and "deviation" are synonymous in this context). However, it's more commonly known as **variance**.

<center>
variance = mean squared distance = $\large \frac{(x_1-\mu)^2+(x_2-\mu)^2+...+(x_N-\mu)^2}{N}$
</center>
<br>
<center>
= $\large \frac{\sum_{i=1}^N(x_i - \mu)^2}{N}$
</center>

**Squaring the distances** or **taking their absolute values** ensures that we get a **variability value that is greater than 0** for all distributions that show some variability.

> Notice, however, that variance and mean absolute deviation will still be 0 for distributions that show no variability.

In [48]:
# A distribution whose values are all identical.
D = [2] * 4

In [49]:
# Mean absolute deviation of D, whose values are all equal.
mean_distance(D)

0.0

In [50]:
def variance_distance(arr):
    """Return the variance (mean squared deviation) of ``arr``.

    The squared deviations are averaged over all values, i.e. the sum is
    divided by the number of values.

    Parameters
    ----------
    arr : iterable of numbers
        The values of the distribution.

    Returns
    -------
    float
        The average of ``(value - mean) ** 2`` over all values.
    """
    center = np.mean(arr)
    squared_deviations = [(value - center) ** 2 for value in arr]
    return np.mean(squared_deviations)

In [51]:
# Variance (mean squared deviation) of C.
variance_distance(C)

36.0