In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

## Distribution of the Sample Average

We will work with the flight delays dataset again (flight delays for United flights in summer 2015).

In [None]:
# Load the flight-delay table from the CSV file next to the notebook.
united = Table.read_table('united_summer2015.csv')
# Bin edges -20, -10, ..., 290 (np.arange excludes the stop value 300).
united_bins = np.arange(-20, 300, 10)
# Histogram of the 'Delay' column using those bin edges.
united.hist('Delay', bins=united_bins)

In [None]:
# Extract the 'Delay' column of the full table.
delays = united.column('Delay')
# Mean and SD over every row of the table (np.std called with its default arguments).
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
# Last expression of the cell: display both values as a tuple.
delay_mean, delay_sd

We'll write code to generate a sample of flights, and take the mean delay in that sample.  We call that the *sample mean*.  Remember that the sample mean is an estimate of the population mean (i.e., mean flight delay of all flights).

In [None]:
def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    Draws `sample_size` rows from the `united` table using `Table.sample`
    with its default settings and averages their 'Delay' values.

    sample_size: number of rows to draw from `united`.
    Returns the mean of the sampled 'Delay' values.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)

Let's take many samples, compute the mean of each, and look at the distribution of these sample means.

In [None]:
def ten_thousand_sample_means(sample_size):
    """Simulate 10,000 sample means to approximate the distribution of the sample mean.

    Each repetition calls `one_sample_mean(sample_size)`, which draws a fresh
    sample directly from the `united` table. Because every sample comes from
    the population table rather than from resampling one observed sample,
    this is a direct simulation, not a bootstrap.

    sample_size: number of flights in each simulated sample.
    Returns a float array of length 10,000 holding one sample mean per repetition.
    """
    repetitions = 10000
    # Preallocate the result: np.append returns a new copy on every call,
    # which makes growing the array inside the loop quadratic in repetitions.
    means = np.empty(repetitions)
    for i in range(repetitions):
        means[i] = one_sample_mean(sample_size)
    return means

In [None]:
# Simulate the means of 10,000 samples of size 400 (kept for the comparison table later).
sample_means_400 = ten_thousand_sample_means(400)
# Histogram of those simulated means with 20 bins.
Table().with_column('Mean of 400 flight delays', sample_means_400).hist(bins=20)
# Print the population mean so it can be compared with the histogram.
print('Population Average:', delay_mean)

How many possible ways are there that the sample could have come out?  In other words, how many possible samples are there?

In [None]:
# Number of rows (flights) in the population table.
united.num_rows

In [None]:
# How many possible samples of size 400 are there?
# Each of the 400 draws can be any row of the table, hence num_rows ** 400
# (presumably ordered draws with replacement, Table.sample's default; confirm).
united.num_rows ** 400

Far too many samples to enumerate them all!  So while there *is* a well-defined distribution given by all possible sample means from all samples, it is too hard to compute it exactly.  Instead, we approximate this distribution by drawing 10,000 random samples, computing the mean of each one, and drawing the histogram of those 10,000 sample means.

Now let's look at how the shape of this distribution of sample means depends on the size of the sample.

In [None]:
# Same simulation as for size 400, now with samples of size 900.
sample_means_900 = ten_thousand_sample_means(900)

In [None]:
# Put both sets of simulated means side by side; each column is labeled by its sample size.
means_tbl = Table().with_columns(
    '400', sample_means_400,
    '900', sample_means_900,
)

In [None]:
# Histogram the columns of means_tbl on common bin edges 5, 5.5, ..., 30.5,
# so the two sample sizes can be compared on the same scale.
means_tbl.hist(bins = np.arange(5, 31, 0.5))
# The trailing semicolon suppresses the text repr of the title object in the cell output.
plots.title('Distribution of Sample Average');

How would you interpret this picture?  What does it tell you about the effect of increasing the sample size?

In [None]:
def plot_and_summarize_sample_means(sample_size):
    """Empirical distribution of random sample means.

    Simulates sample means for samples of `sample_size` flights with
    `ten_thousand_sample_means`, prints their average and SD next to the
    population mean and SD (`delay_mean`, `delay_sd`), and draws a 20-bin
    histogram of the simulated means.

    sample_size: number of flights in each simulated sample.
    Returns None; the results are printed and plotted.
    """
    sample_means = ten_thousand_sample_means(sample_size)
    sample_means_tbl = Table().with_column('Sample Means', sample_means)

    # Print some information about the distribution of the sample means
    print("Sample size: ", sample_size)
    print("Population mean:", delay_mean)
    print("Average of sample means: ", np.mean(sample_means))
    print("Population SD:", delay_sd)
    print("SD of sample means:", np.std(sample_means))

    # Plot a histogram of the sample means
    sample_means_tbl.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))

In [None]:
# Summarize and plot the simulated sample means for samples of size 100.
plot_and_summarize_sample_means(100)

In [None]:
# NOTE(review): hard-coded numbers, presumably the population SD (39.48) divided by the
# "SD of sample means" printed by the previous cell for sample size 100 (confirm).
# The divisor comes from a random simulation, so it will differ from run to run.
39.48 / 3.932

In [None]:
# Summarize and plot the simulated sample means for samples of size 400.
plot_and_summarize_sample_means(400)

In [None]:
# NOTE(review): hard-coded numbers, presumably the population SD (39.48) divided by the
# "SD of sample means" printed by the previous cell for sample size 400 (confirm).
# The divisor comes from a random simulation, so it will differ from run to run.
39.48 / 1.973

In [None]:
# Summarize and plot the simulated sample means for samples of size 625.
plot_and_summarize_sample_means(625)

In [None]:
# NOTE(review): hard-coded numbers, presumably the population SD (39.48) divided by the
# "SD of sample means" printed by the previous cell for sample size 625 (confirm).
# The divisor comes from a random simulation, so it will differ from run to run.
39.48 / 1.577

In [None]:
# NOTE(review): 39.48 is presumably the population SD (delay_sd), hard-coded; confirm.
# Divides it by the square root of the sample size 100.
39.48 / np.sqrt(100)

In [None]:
# NOTE(review): 39.48 is presumably the population SD (delay_sd), hard-coded; confirm.
# Divides it by the square root of the sample size 400.
39.48 / np.sqrt(400)

In [None]:
# NOTE(review): 39.48 is presumably the population SD (delay_sd), hard-coded; confirm.
# Divides it by the square root of the sample size 625.
39.48 / np.sqrt(625)