In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Average (Mean)

We can compute the average of some numeric data in a variety of ways:

In [None]:
values = make_array(2, 3, 3, 9)

In [None]:
sum(values)/len(values)

In [None]:
np.average(values)

In [None]:
np.mean(values)

In [None]:
(2 + 3 + 3 + 9)/4

It's often helpful to think of the average as a weighted sum of the distinct values, where each weight is the proportion of times that value appears:

In [None]:
2*(1/4) + 3*(2/4) + 9*(1/4)

In [None]:
values_table = Table().with_columns('value', values)
values_table

In [None]:
# Unit-width bins with edges at 0.5, 1.5, ..., 10.5, so each integer value falls in the middle of its own bar
bins_for_display = np.arange(0.5, 10.6, 1)
values_table.hist('value', bins = bins_for_display)

In [None]:
## Make array of 10 2s, 20 3s, and 10 9s

# Each distinct value is repeated by its matching count, in order:
# ten 2s, then twenty 3s, then ten 9s (40 entries in total).
new_vals = np.repeat(make_array(2, 3, 9), make_array(10, 20, 10))

In [None]:
Table().with_column('value', new_vals).hist(bins = bins_for_display)

In [None]:
np.average(values)

In [None]:
np.average(new_vals)

## Standard Deviations

Standard deviations (SDs) are our primary choice of measurement for quantifying the variability (spread) of
a distribution.

In [None]:
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
average_value = np.average(sd_table.column(0))
average_value

In [None]:
# Deviation = value minus the average (negative for values below the average)
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

In [None]:
# The positive and negative deviations cancel, so the plain sum of deviations cannot measure spread
sum(deviations)

In [None]:
sd_table = sd_table.with_columns('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Variance of the data: the mean of the squared deviations

variance = np.mean(sd_table.column('Squared Deviation'))
variance

In [None]:
# Standard Deviation (SD) is the square root of the variance

sd = variance ** 0.5
sd

In [None]:
np.sqrt(np.var(values))

In [None]:
np.std(values)

## The Normal Distribution

The Normal distribution is the most famous of the bell-shaped distributions. Let's analyze some approximately
normally distributed data:

In [None]:
births = Table.read_table('https://www.inferentialthinking.com/data/baby.csv')
births.show(3)

In [None]:
# Unit-width bins with edges at 56.5, 57.5, ..., 72.5, so each bar is centered on a whole number
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))

In [None]:
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

If the data are approximately normally distributed, about 95% of the values will fall within 2 SDs of the mean. Does that look correct?

In [None]:
# Endpoints of the interval from 2 SDs below the mean to 2 SDs above it
np.mean(heights) - 2 * np.std(heights), np.mean(heights) + 2 * np.std(heights)

## The Central Limit Theorem

Let's apply the Central Limit Theorem (CLT) to United Airlines flight delay data collected at SFO in
the summer of 2015.

In [None]:
united = Table.read_table('https://www.inferentialthinking.com/data/united_summer2015.csv')
# Bin edges -20, -10, ..., 290 (width 10), used for the delay histogram
united_bins = np.arange(-20, 300, 10)
united

In [None]:
united.hist('Delay', bins=united_bins)

In [None]:
# Mean and SD of the 'Delay' column over every row of `united`;
# later cells label these as the population mean and population SD
delays = united.column('Delay')
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
delay_mean, delay_sd

In [None]:
# 50th percentile of the delays, i.e. the median
percentile(50, delays)

In [None]:
def one_sample_mean(sample_size):
    """Return the mean 'Delay' of one random sample of `sample_size` rows of `united`.

    The rows are drawn with `united.sample(sample_size)`, using that method's
    default sampling settings.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)

one_sample_mean(100)

In [None]:
def ten_thousand_sample_means(sample_size, repetitions=10000):
    """Simulate the sample mean repeatedly and collect the results.

    Parameters
    ----------
    sample_size : int
        Number of flights in each random sample; passed to `one_sample_mean`.
    repetitions : int, optional
        How many samples to draw. Defaults to 10,000, the count the
        function is named for.

    Returns
    -------
    numpy.ndarray
        Float array of length `repetitions` holding one sample mean per entry.
    """
    # Gather the means in a list and convert once at the end. Calling
    # np.append inside the loop copies the whole array on every iteration,
    # which makes the simulation quadratic in the number of repetitions.
    means = [one_sample_mean(sample_size) for _ in range(repetitions)]
    return np.array(means, dtype=float)

In [None]:
sample_means_100 = ten_thousand_sample_means(100)

In [None]:
sample_means_100

In [None]:
len(sample_means_100)

In [None]:
# Empirical distribution of the mean of 100 flight delays;
# the population average is printed for comparison with the center of the histogram
Table().with_column('Mean of 100 flight delays', sample_means_100).hist(bins=20)
print('Population Average:', round(delay_mean, 3))

In [None]:
# Repeat the simulation with samples of 400 flights and plot the resulting sample means;
# the population average is printed for comparison with the center of the histogram
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 flight delays', sample_means_400).hist(bins=20)
print('Population Average:', round(delay_mean, 3))

In [None]:
# Repeat the simulation with samples of 900 flights and plot the resulting sample means;
# the population average is printed for comparison with the center of the histogram
sample_means_900 = ten_thousand_sample_means(900)
Table().with_column('Mean of 900 flight delays', sample_means_900).hist(bins=20)
print('Population Average:', round(delay_mean, 3))

## Distribution of the Sample Average

Now we'll see how the CLT relates to the distribution of the sample average.

In [None]:
means_tbl = Table().with_columns(
    '400', sample_means_400,
    '900', sample_means_900,
)

In [None]:
# Histogram both columns on the same bins (width 0.5, edges from 5 to 30.5)
# so the spreads for sample sizes 400 and 900 can be compared directly
means_tbl.hist(bins = np.arange(5, 31, 0.5))
plots.title('Distribution of Sample Average');

In [None]:
united.num_rows

In [None]:
# How many possible sample means are there?
# Each of the 400 draws can be any of the num_rows flights, so there are
# num_rows ** 400 ordered samples when drawing with replacement. That is far
# too many to enumerate, which is why the distribution is simulated instead.
united.num_rows ** 400

In [None]:
# NOTE(review): same values as the earlier delay_mean / delay_sd cell;
# presumably recomputed so this section can be run on its own — confirm before removing
delay_mean = np.mean(united.column('Delay'))
delay_sd = np.std(united.column('Delay'))

In [None]:
"""Empirical distribution of random sample means"""

def plot_and_summarize_sample_means(sample_size):
    """Simulate sample means for one sample size, then print a summary and plot them.

    Calls `ten_thousand_sample_means(sample_size)`, prints the population mean
    and SD (read from the module-level `delay_mean` and `delay_sd`) next to the
    mean and SD of the simulated sample means, and draws a 20-bin histogram of
    the simulated means titled with the sample size.

    Parameters
    ----------
    sample_size : int
        Number of flights in each random sample.

    Returns
    -------
    None
        Results are printed and plotted rather than returned.
    """
    sample_means = ten_thousand_sample_means(sample_size)
    sample_means_tbl = Table().with_column('Sample Means', sample_means)
    
    # Print some information about the distribution of the sample means
    print("Sample size: ", sample_size)
    print("Population mean:", delay_mean)
    print("Average of sample means: ", np.mean(sample_means))
    print("Population SD:", delay_sd)
    print("SD of sample means:", np.std(sample_means))

    # Plot a histogram of the sample means
    sample_means_tbl.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))

In [None]:
plot_and_summarize_sample_means(100)

In [None]:
# (population SD / SD of the sample means) squared. Because the SD of the sample
# mean is population SD / sqrt(sample size), this should come out close to the
# sample size, 100. It is computed from the simulated means so the cell stays
# correct on every run, instead of using numbers copied from an earlier printout.
(delay_sd / np.std(sample_means_100)) ** 2

In [None]:
plot_and_summarize_sample_means(400)

In [None]:
# (population SD / SD of the sample means) squared. Because the SD of the sample
# mean is population SD / sqrt(sample size), this should come out close to the
# sample size, 400. It is computed from the simulated means so the cell stays
# correct on every run, instead of using numbers copied from an earlier printout.
(delay_sd / np.std(sample_means_400)) ** 2