In [None]:
from datascience import *
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Flight delays; a single sample

In [None]:
# Read the CSV at this URL into a Table. Judging by the file name this is
# United flight data from summer 2015 -- TODO confirm against the data source.
united = Table.read_table('http://inferentialthinking.com/data/united_summer2015.csv')

In [None]:
# Keep only the rows whose Destination is exactly 'LAX'; this table is the
# "population" that the rest of the notebook samples from.
la = united.where('Destination', are.equal_to('LAX'))

In [None]:
# One random sample of 100 rows from the population, drawn without replacement.
sample_from_population = la.sample(k=100, with_replacement=False)

In [None]:
# Median of the 'Delay' column in the single sample.
np.median(sample_from_population.column('Delay'))

## Variability among samples

In [None]:
# Examine 200 samples: each is 100 rows drawn without replacement from the
# population, and we record the median 'Delay' of each one.
repetitions = np.arange(200)
medians = [
    np.median(la.sample(100, with_replacement=False).column('Delay'))
    for _ in repetitions
]

# Scatter the sample medians against the repetition index.
Table().with_columns('i', repetitions, 'median', medians).scatter('i')

In [None]:
# Histogram of the sample medians collected above.
Table().with_column('median', medians).hist('median')

In [None]:
# 2.5th percentile of the sample medians (lower end of the middle 95%).
percentile(2.5, medians)

In [None]:
# 97.5th percentile of the sample medians (upper end of the middle 95%).
percentile(97.5, medians)

In [None]:
# Report the range that holds the middle 95% of the sample medians.
low_end, high_end = (percentile(p, medians) for p in (2.5, 97.5))
print("95% of sample medians were between", low_end, "and", high_end)

## Predicting the accuracy of our estimate -- the bootstrap

In [None]:
# From now on, this single sample is all we have: the resampling cells that
# follow draw from this table rather than from the population table `la`.
sample_from_population

In [None]:
# Our estimate: the median 'Delay' in the one sample we have.
np.median(sample_from_population.column('Delay'))

In [None]:
# A bootstrap resample: same number of rows as the sample, drawn from it
# with replacement (spelled out here; it is also the default for sample()).
resample_1 = sample_from_population.sample(with_replacement=True)
resample_1.show(3)

In [None]:
# Median 'Delay' of the first resample.
np.median(resample_1.column('Delay'))

In [None]:
# A second, independent bootstrap resample of the same sample
# (with replacement, same number of rows).
resample_2 = sample_from_population.sample(with_replacement=True)
resample_2.show(3)

In [None]:
# Median 'Delay' of the second resample.
np.median(resample_2.column('Delay'))

In [None]:
# Bootstrap: resample the sample 5000 times and record each resample's median
# 'Delay'. The medians are collected in one pass and converted to an array
# once, rather than calling np.append per iteration (which copies the whole
# array every time), and no throwaway loop variables are left in the
# notebook's global namespace.
resampled_medians = np.array([
    np.median(sample_from_population.sample()['Delay'])
    for _ in np.arange(5000)
])

Table().with_column('Resampled median', resampled_medians).hist(0)

## Confidence intervals

In [None]:
# Report the range that holds the middle 95% of the resampled medians.
boot_low, boot_high = (percentile(p, resampled_medians) for p in (2.5, 97.5))
print("95% of resampled medians were between", boot_low, "and", boot_high)

In [None]:
# Median 'Delay' of the whole population table; later cells reuse pop_median.
pop_median = np.median(la.column('Delay'))

# Ends of the middle 95% of the resampled medians.
interval_95 = [percentile(p, resampled_medians) for p in (2.5, 97.5)]

# Histogram of resampled medians, with the interval drawn in gold along the
# horizontal axis and the population median marked as a red dot.
Table().with_column('Resampled median', resampled_medians).hist('Resampled median')
plots.plot(interval_95, [0, 0], color='gold', linewidth=5)
plots.scatter(pop_median, 0, color='red', s=400)

In [None]:
def bootstrap_median(sample_from_pop, repetitions=5000, label='Delay'):
    """Return an array of medians of bootstrap resamples of a sample.

    Each repetition calls ``sample_from_pop.sample()`` with no arguments
    and takes the median of the resample's ``label`` column.

    Parameters:
        sample_from_pop: table-like object providing ``.sample()`` and
            column access by label via ``[...]``.
        repetitions: number of resamples to draw (default 5000).
        label: column whose median is computed (default 'Delay').

    Returns:
        A float numpy array of length ``repetitions``.
    """
    # Build the result once instead of np.append-ing inside the loop, which
    # copies the whole array on every iteration.
    return np.array([
        np.median(sample_from_pop.sample()[label])
        for _ in np.arange(repetitions)
    ])

def plot_bootstrap(sample_from_pop):
    """Plot bootstrapped medians of a sample with its 95% interval.

    Draws a histogram of the resampled medians, a gold segment along the
    horizontal axis spanning their 2.5th to 97.5th percentiles, and a red
    dot at the median 'Delay' of the global population table ``la``.
    Returns nothing.
    """
    medians_from_resamples = bootstrap_median(sample_from_pop)
    ends = [percentile(p, medians_from_resamples) for p in (2.5, 97.5)]

    Table().with_column('Resampled median', medians_from_resamples).hist('Resampled median')
    plots.plot(ends, [0, 0], color='gold', linewidth=5)

    true_median = np.median(la.column('Delay'))
    plots.scatter(true_median, 0, color='red', s=400)

In [None]:
# Draw a fresh sample of 100 rows from the population. Sample WITHOUT
# replacement, as every other population draw in this notebook does;
# Table.sample defaults to with_replacement=True, which could repeat rows.
different_sample_from_population = la.sample(100, with_replacement=False)
print('Sample median:', np.median(different_sample_from_population['Delay']))

In [None]:
# Bootstrap the new sample and plot its resampled medians, the middle-95%
# interval (gold) and the population median (red dot).
plot_bootstrap(different_sample_from_population)

In [None]:
# THE BIG SIMULATION: This one takes several minutes.

# Generate 100 intervals, in the table intervals.
# Each interval comes from a fresh sample of 100 rows drawn without
# replacement from the population; its ends are the 2.5th and 97.5th
# percentiles of that sample's bootstrapped medians.

left_ends = make_array()
right_ends = make_array()

for i in np.arange(100):
    sample_from_pop = la.sample(100, with_replacement=False)
    # Deliberately NOT named `medians`: that global holds the sample medians
    # computed earlier in the notebook, and overwriting it here would change
    # what the earlier histogram/percentile cells show if they are re-run.
    bootstrap_medians = bootstrap_median(sample_from_pop)
    left_ends = np.append(left_ends, percentile(2.5, bootstrap_medians))
    right_ends = np.append(right_ends, percentile(97.5, bootstrap_medians))

intervals = Table().with_columns(
    'Left', left_ends,
    'Right', right_ends
)

In [None]:
# Display the interval table: one row per simulated sample, with the
# interval's ends in the 'Left' and 'Right' columns.
intervals

In [None]:
# Count the intervals that contain the population median. The comparison is
# inclusive: an interval whose end point equals pop_median does contain it,
# and such ties are common because medians of the delay data take few
# distinct values. Strict below/above would undercount coverage.
intervals.where(
    'Left', are.below_or_equal_to(pop_median)
).where(
    'Right', are.above_or_equal_to(pop_median)
).num_rows

In [None]:
# Number of intervals, taken from the simulation results instead of a
# hard-coded 100 so the plot stays in step if the simulation size changes.
n = len(left_ends)

# Column labels '1'..'n', one per replication.
replication_number = np.arange(1, n + 1).astype(str)

# Two-row table: row 0 holds the left ends, row 1 the right ends, so
# column i is the [left, right] pair of replication i+1.
intervals2 = Table(replication_number).with_rows(make_array(left_ends, right_ends))

plots.figure(figsize=(8,8))
for i in np.arange(n):
    ends = intervals2.column(i)
    # Horizontal gold segment for this interval at height i+1.
    plots.plot(ends, make_array(i+1, i+1), color='gold')
# Vertical red line at the population median, spanning all replications.
plots.plot(make_array(pop_median, pop_median), make_array(0, n), color='red', lw=2)
plots.xlabel('Median (minutes)')
plots.ylabel('Replication')
plots.title('Population Median and Intervals of Estimates');