In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Central Limit Theorem

In [None]:
# A biased coin described as a distribution: each face paired with its chance.
# with_columns (plural) is the documented Table method for adding several
# label/values pairs in a single call.
coin = Table().with_columns(
    'Face', ['Heads', 'Tails'],
    'Chance', [0.6, 0.4])
coin


In [None]:
# Simulate one run of 1000 tosses drawn from the coin distribution.
# NOTE(review): the first argument (1) presumably selects the 'Chance' column
# by index — confirm against the datascience sample_from_distribution docs.
sample = coin.sample_from_distribution(1, 1000)
sample

In [None]:
# coin only has two columns ('Face', 'Chance'), so column index 2 is the one
# added by the sampling call; row 0 is the 'Heads' row. Presumably this is the
# number of heads in the simulated sample.
sample.column(2).item(0)

In [None]:
# Repeat the 1000-toss simulation 100000 times, keeping the row-0 ('Heads')
# entry of column 2 from each simulated sample, then plot the results.
heads = [
    coin.sample_from_distribution(1, 1000).column(2).item(0)
    for _trial in np.arange(100000)
]

Table().with_column('Heads out of 1000', heads).hist(bins=25)

In [None]:
# Load the united_summer2015 CSV; its 'Delay' column is used in the cells below.
united_csv_url = 'http://inferentialthinking.com/notebooks/united_summer2015.csv'
united = Table.read_table(united_csv_url)
united

In [None]:
# Histogram of the 'Delay' column over every row of the table, using 30 bins.
united.hist('Delay', bins=30)

In [None]:
# Mean and standard deviation of the 'Delay' column over the whole table.
delay_values = united.column('Delay')
mean_delay = np.mean(delay_values)
sd_delay = np.std(delay_values)

[mean_delay, sd_delay]

In [None]:
# Keep only the 'Delay' column, then show the mean of one random sample of 400 rows.
delay = united.select('Delay')
one_sample = delay.sample(400)
np.mean(one_sample.column(0))

In [None]:
# Means of 10000 independent random samples of 400 rows each from `delay`.
means = [
    np.mean(delay.sample(400).column(0))
    for _trial in np.arange(10000)
]

In [None]:
# Empirical distribution of the simulated sample means held in `means`.
Table().with_column('Sample mean', means).hist(bins=30, unit='minute')

## Variability of the sample mean

In [None]:
def sample_means(sample_size, repetitions=10000):
    """Simulate the distribution of the sample mean of the delays.

    Repeatedly draws a random sample of `sample_size` rows from the
    module-level `delay` table and records the mean of its column 0.

    Args:
        sample_size: number of rows drawn for each sample.
        repetitions: how many samples to draw. Defaults to 10000, the value
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        A list containing one sample mean per repetition.
    """
    return [
        np.mean(delay.sample(sample_size).column(0))
        for _ in np.arange(repetitions)
    ]

In [None]:
# Overlay the simulated sample-mean distributions for three sample sizes.
# with_columns (plural) is the documented Table method for adding several
# label/values pairs in a single call.
Table().with_columns(
    '400', sample_means(400),
    '900', sample_means(900),
    '2500', sample_means(2500),
).hist(bins=30, unit='minute')

In [None]:
# SD of the full 'Delay' column, shown for comparison with the spread of the sample means.
sd_delay

In [None]:
def variability(sample_size):
    """Plot simulated sample means and compare their SD with sd_delay / sqrt(n).

    Calls `sample_means(sample_size)`, draws a histogram of the results, and
    prints the sample size, its square root, the SD of the simulated means,
    and the module-level `sd_delay` divided by that square root.

    Args:
        sample_size: number of rows in each simulated sample.
    """
    simulated = sample_means(sample_size)
    Table().with_column('Sample mean', simulated).hist(bins=30, unit='minute')

    root_n = np.sqrt(sample_size)
    observed_sd = np.std(simulated)
    predicted_sd = sd_delay / root_n

    print('Sample size:          ', sample_size)
    print('Square root n:        ', root_n)
    print('Sample mean SD:       ', observed_sd)
    print(sd_delay, '/', root_n, '=', predicted_sd)

In [None]:
# Sample sizes with whole-number square roots (20, 25, 30, 40, 50). Each call
# prints the SD of the simulated sample means next to sd_delay / sqrt(n).
variability(400)

In [None]:
variability(625)

In [None]:
variability(900)

In [None]:
variability(1600)

In [None]:
variability(2500)

In [None]:
# Small sample sizes, starting from n = 1 (each "sample" is a single row).
variability(1)

In [None]:
variability(2)

In [None]:
variability(4)

In [None]:
variability(16)

In [None]:
variability(30)

In [None]:
variability(50)

In [None]:
variability(100)

In [None]:
variability(200)

In [None]:
# Same sample size as the first call in this series.
variability(400)

## Experiment Design

In [None]:
# SD of a 0/1 population of ten votes for different proportions of ones.
# Half ones, half zeros: SD is 0.5.
votes = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
np.std(votes)

In [None]:
# 30% ones: SD is sqrt(0.3 * 0.7), about 0.458.
votes = [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
np.std(votes)

In [None]:
# 20% ones: SD is 0.4.
votes = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
np.std(votes)

In [None]:
# 10% ones: SD is 0.3.
votes = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.std(votes)

In [None]:
# 70% ones: same SD as 30% ones.
votes = [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
np.std(votes)

In [None]:
# 80% ones: same SD as 20% ones.
votes = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
np.std(votes)

In [None]:
# 90% ones: same SD as 10% ones.
votes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
np.std(votes)

In [None]:
# Sample size needed for an interval of a chosen total width.
# Interval width = width_in_sds * (population SD / sqrt(n)); solve for n.
worst_sd_of_population = 0.5  # SD of a 0/1 population split half and half
width_in_sds = 4              # mean +/- 2 SDs covers about 95% of a normal distribution
desired_width = 0.03          # total width wanted for the interval

sqrt_sample_size = worst_sd_of_population * width_in_sds / desired_width
sample_size = sqrt_sample_size ** 2
sample_size

In [None]:
# The sample_size formula with SD 0.5, 4 SDs and width 0.03 gives about 4444.4;
# n is that value rounded up to a whole number.
n = 4445

### Conducting the experiment

In [None]:
# Draw n faces from the coin's first column, weighting each face by its 'Chance'.
faces = coin.select(0)
face_weights = coin.column('Chance')
observed_sample = faces.sample(n, weights=face_weights)
observed_sample

In [None]:
# Resample the observed sample 1000 times and record the proportion of 'Heads'
# in each resample, then print the 2.5th and 97.5th percentiles of those proportions.
means = [
    np.count_nonzero(observed_sample.sample().column(0) == 'Heads') / n
    for _replication in np.arange(1000)
]
lower_bound = percentile(2.5, means)
upper_bound = percentile(97.5, means)
print(lower_bound, upper_bound)