# Lecture 24: Center and Spread


In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

# Average #

## Calculating the average (a.k.a. mean) ##

In [None]:
# A small data set of four numbers; the cells below compute its average several ways.
values = [2, 3, 3, 9]

In [None]:
# Average by hand: add up the four entries and divide by how many there are.
(2 + 3 + 3 + 9)/4

In [None]:
# Same calculation with Python's built-in sum and len, so nothing is hard-coded.
sum(values)/len(values)

In [None]:
# numpy's average function applied to the same list.
np.average(values)

In [None]:
# "Mean" is another word for average; np.mean is applied to the same list for comparison.
np.mean(values)

## The Average and the Histogram ##

In [None]:
# Average as a weighted sum: each distinct value times the proportion of entries
# equal to it (2 appears 1/4 of the time, 3 appears 2/4, 9 appears 1/4).
2 * (1/4) + 3 * (2/4) + 9 * (1/4)

In [None]:
# Put the values in a one-column table so they can be drawn as a histogram.
values_table = Table().with_columns('value', values)
values_table

In [None]:
# Unit-width bins with edges at 0.5, 1.5, ..., 10.5, so each whole number
# from 1 to 10 sits in the middle of its own bin.
bins_for_display = np.arange(0.5, 10.6, 1)
# Histogram of column 0 (the 'value' column).
values_table.hist(0, bins = bins_for_display)

In [None]:
# An array of ten 1's: the building block for the larger data set assembled below.
ten_1s = np.ones(10)
ten_1s

In [None]:
# Scaling the array of ones gives ten copies of 2 and ten copies of 9.
ten_2s = 2 * ten_1s
ten_9s = 9 * ten_1s

In [None]:
# Twenty copies of 3, so 3 stays twice as common as 2 or 9 (the same proportions as in `values`).
twenty_3s = 3 * np.ones(20)

In [None]:
# Join the three pieces end to end into one 40-element array:
# the ten 2's first, then the twenty 3's, then the ten 9's.
new_values = np.concatenate([ten_2s, twenty_3s, ten_9s])
new_values

In [None]:
# 10 + 20 + 10 entries in total: ten times as many as in `values`.
len(new_values)

In [None]:
# Histogram of the larger data set, drawn with the same bins as the histogram of `values`.
# The proportions of 2's, 3's and 9's are unchanged, so the two pictures can be compared directly.
new_values_table = Table().with_columns('value', new_values)
new_values_table.hist(0, bins = bins_for_display)

In [None]:
# Averages of the small and the large data set, side by side.
np.average(values), np.average(new_values)

**Back to slides.**

## The Average and the Median ##

In [None]:
# Two lists that are identical except for their largest entry (5 versus 10).
dist_1 = [1, 2, 2, 3, 3, 3, 4, 4, 5]
dist_2 = [1, 2, 2, 3, 3, 3, 4, 4, 10]

In [None]:
# 50th percentile (the median) of each list.
percentile(50, dist_1), percentile(50, dist_2)

In [None]:
# Averages of the same two lists, for comparison with their medians.
np.average(dist_1), np.average(dist_2)

In [None]:
# Load the NBA table; the path is relative, so 'nba2013.csv' must be in the working directory.
nba = Table.read_table('nba2013.csv')

In [None]:
# Column labels of the NBA table.
nba.labels

In [None]:
# Histogram of the 'Height' column with unit-width bins whose edges run from 65.5 to 89.5.
nba.hist('Height', bins=np.arange(65.5, 90.5))

In [None]:
# Median (50th percentile) and average of the heights, side by side.
heights = nba.column('Height')
percentile(50, heights), np.average(heights)

**Back to slides.**

# Standard Deviation #

## Calculating the SD ##

In [None]:
# The same four numbers as in the Average section, now placed in a table
# that the following cells extend one column at a time.
values = [2, 3, 3, 9]
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# Average of column 0 of sd_table (the 'Value' column).
average_value = np.average(sd_table.column(0))
average_value

In [None]:
# Deviation from average = value - average.
# The values are read from the table column (a numpy array) rather than from the
# global list `values`: array arithmetic does not rely on numpy coercing a plain
# Python list, and the cell keeps working even if `values` is rebound elsewhere.
deviations = sd_table.column('Value') - average_value
sd_table = sd_table.with_columns('Deviation', deviations)
sd_table

In [None]:
# The deviations from the average add up to zero (up to floating-point rounding):
# the positive and negative deviations cancel out.
sum(deviations)

In [None]:
# Square each deviation so that positive and negative deviations no longer cancel.
sd_table = sd_table.with_columns('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Variance of the data: the mean of the squared deviations from the average.

variance = np.mean(sd_table.column('Squared Deviation'))
variance

In [None]:
# np.var computes the variance directly; by default it divides by n, like the calculation above.
np.var(values)
# Alternative that divides by n - 1 instead of n; uncomment to compare:
#np.var(values,ddof=1)

In [None]:
# Standard Deviation (SD) is the square root of the variance.
# Taking the root puts the measure of spread back in the same units as the values.

sd = variance ** 0.5
sd

In [None]:
# A cell displays only its last expression, so show both results together as a tuple.
# First entry: np.std with its default ddof=0 divides by n and matches `sd` computed above.
# Second entry: ddof=1 divides by n - 1 instead, which gives a slightly larger number.
np.std(values), np.std(values, ddof=1)

**Back to slides.**

## Chebyshev's Bounds ##

In [None]:
# Load the births table; the path is relative, so 'baby.csv' must be in the working directory.
births = Table.read_table('baby.csv')

In [None]:
# Column labels of the births table.
births.labels

In [None]:
# Histograms of the table's columns, drawn separately (overlay=False) rather than on shared axes.
births.hist(overlay=False)

In [None]:
# Average and SD of the 'Maternal Pregnancy Weight' column.
# Note: this rebinds `sd`, which until now held the SD of `values`.
mpw = births.column('Maternal Pregnancy Weight')
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd

In [None]:
# Keep only the rows whose maternal pregnancy weight is within 3 SDs of the average.
# NOTE(review): whether are.between includes its endpoints is not visible here; check the datascience docs.
within_3_SD = births.where('Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# Proportion of all rows that fall within 3 SDs of the average.
within_3_SD.num_rows/births.num_rows

In [None]:
# Empirical check of Chebyshev's bounds: for every column of `births`, print the
# percentage of rows lying within z SDs of that column's average, for z = 1, ..., 5.
# Chebyshev guarantees at least 1 - 1/z**2 of the data for any distribution.
total_rows = births.num_rows  # the same for every column, so look it up once
for k in births.labels:
    # The loop-local names are deliberately different from the notebook-level
    # `values` and `sd`, so running this cell does not overwrite them.
    column_values = births.column(k)
    ave = np.mean(column_values)
    column_sd = np.std(column_values)
    print()
    print(k)
    for z in np.arange(1, 6):
        selected = births.where(k, are.between(ave - z*column_sd, ave + z*column_sd))
        proportion = selected.num_rows/total_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%' )

**Back to slides.**

## Standard Units ##

In [None]:
def standard_units(x):
    """Convert array x to standard units.

    Each entry is re-expressed as the number of SDs it lies above (positive)
    or below (negative) the mean of x. The SD is np.std with its default
    settings, i.e. the one that divides by the number of entries.
    """
    center = np.mean(x)
    spread = np.std(x)
    deviations_from_center = x - center
    return deviations_from_center / spread

In [None]:
# Average and SD of the 'Maternal Age' column, in the column's original units.
ages = births.column('Maternal Age')
np.mean(ages), np.std(ages)

In [None]:
# The same ages converted to standard units.
ages_standard_units = standard_units(ages)

In [None]:
# In standard units the average should be 0 and the SD 1, up to floating-point rounding.
np.mean(ages_standard_units), np.std(ages_standard_units)

In [None]:
# Table pairing each age (column 0) with the same age in standard units (column 1).
both = Table().with_columns(
    'Age in Years', ages,
    'Age In Standard Units', ages_standard_units
)
both

In [None]:
# Histogram of the ages in years (column 0), with bins of width 2 whose edges run from 15 to 45.
both.hist(0, bins = np.arange(15, 46, 2))

In [None]:
# Histogram of the same ages in standard units (column 1), with bins of width 0.35 starting at -2.2.
# NOTE(review): the bin width and start were presumably chosen to line up with the 2-year bins of the previous histogram; confirm.
both.hist(1, bins = np.arange(-2.2, 3.4, 0.35))
# Restrict the x-axis to [-2, 3.1]; the trailing semicolon suppresses the echoed return value.
plots.xlim(-2, 3.1);