In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Interval Estimates

In [None]:
# Load the San Francisco 2015 CSV and keep three columns by position (3, 11, 21).
# After the select, column index 2 is the one later referred to by the label
# 'Total Compensation'.
sf = Table.read_table('http://inferentialthinking.com/notebooks/san_francisco_2015.csv').select(3, 11, 21)
# Attach a NumberFormatter(0) display format to column 2.
sf.set_format(2, NumberFormatter(0))
# Keep only the rows whose column-2 value is above 10000.
sf = sf.where(2, are.above(10000))
# Show the first 3 rows.
sf.show(3)

In [None]:
# Bin edges from 0 up to (but not including) 700000, in steps of 25000.
comp_bins = np.arange(0, 700000, 25000)
# Histogram of column 2 of sf using those bins.
sf.hist(2, bins=comp_bins)

In [None]:
# Median of column 2 over every row of sf; the later cells compare their
# interval estimates against this value.
pop_median = np.median(sf.column(2))
pop_median

In [None]:
# One random sample of 500 rows, drawn without replacement, stands in for
# "the sample we actually observed".
sample_from_population = sf.sample(500, with_replacement=False)

# Bootstrap: resample that sample 1000 times (Table.sample() with its default
# arguments) and record the median of column 2 of each resample.
resampled_medians = [
    np.median(sample_from_population.sample().column(2))
    for _ in np.arange(1000)
]

# Endpoints of the middle 95% of the resampled medians.
lower_end = percentile(2.5, resampled_medians)
upper_end = percentile(97.5, resampled_medians)
interval_95 = [lower_end, upper_end]

# Histogram of the resampled medians, with the interval drawn in gold along
# the horizontal axis and the population median marked as a red dot.
medians_table = Table().with_column('Resampled median', resampled_medians)
medians_table.hist(0)
plots.plot(interval_95, [0, 0], color='gold', lw=8)
pop_median = np.median(sf.column(2))
plots.scatter(pop_median, 0, color='red', s=400);

In [None]:
def bootstrap_median(sample_from_population, label, repetitions):
    """Return a list of medians, one per bootstrap resample.

    Parameters:
        sample_from_population: table-like object; only its ``sample()`` and
            ``column(label)`` methods are used.
        label: column label or index forwarded to ``column``.
        repetitions: number of resamples to draw.

    Returns:
        A Python list holding ``np.median`` of the chosen column for each
        resample, in the order the resamples were drawn.
    """
    return [
        np.median(sample_from_population.sample().column(label))
        for _ in np.arange(repetitions)
    ]

In [None]:
# THE BIG SIMULATION: This one takes several minutes.

# Generate 100 intervals, in the table intervals

total_comps = sf.select(2)

# Collect the endpoints in plain lists and convert once at the end.
lefts, rights = [], []
for replication in np.arange(100):
    # Fresh sample of 200 rows from the population, then 5000 bootstrap
    # medians computed from that sample.
    sample_from_pop = total_comps.sample(200, with_replacement=False)
    medians = bootstrap_median(sample_from_pop, 'Total Compensation', 5000)
    lefts.append(percentile(2.5, medians))
    rights.append(percentile(97.5, medians))

# Float arrays, matching what appending to an empty make_array() produces.
left_ends = np.array(lefts, dtype=float)
right_ends = np.array(rights, dtype=float)

intervals = Table().with_columns(
    'Left', left_ends,
    'Right', right_ends
)

In [None]:
# Display the table of interval endpoints ('Left' and 'Right' columns).
intervals

In [None]:
# Count the intervals that contain the population median: left end below it
# and right end above it.
left_below_median = intervals.where('Left', are.below(pop_median))
containing_median = left_below_median.where('Right', are.above(pop_median))
containing_median.num_rows

In [None]:
# Number of intervals to draw, taken from the simulated endpoints so the plot
# stays in step with the simulation instead of relying on a separately
# hard-coded 100 / 101.
n = len(left_ends)

# Column labels '1' .. 'n', one column per replication.
replication_number = np.arange(1, n + 1).astype(str)

# Two rows (all left ends, then all right ends), so column i holds the pair
# of endpoints of the i-th interval.
intervals2 = Table(replication_number).with_rows(make_array(left_ends, right_ends))

plots.figure(figsize=(8,8))
for i in np.arange(n):
    ends = intervals2.column(i)
    # Horizontal gold segment for interval i, drawn at height i+1.
    plots.plot(ends, make_array(i+1, i+1), color='gold')
# Vertical red line at the population median, spanning all replications.
plots.plot(make_array(pop_median, pop_median), make_array(0, n), color='red', lw=2)
plots.xlabel('Median (dollars)')
plots.ylabel('Replication')
plots.title('Population Median and Intervals of Estimates');

In [None]:
# Read the births data from baby.csv (path is relative to the working directory).
births = Table.read_table('baby.csv')

In [None]:
# Display the births table.
births

In [None]:
# Keep only the two columns needed for the ratio: 'Birth Weight' becomes
# column 0 and 'Gestational Days' becomes column 1.
babies = births.select('Birth Weight', 'Gestational Days')
babies

In [None]:
# Row-by-row ratio of column 0 ('Birth Weight') to column 1
# ('Gestational Days'), added as a third column.
birth_weight = babies.column(0)
gestational_days = babies.column(1)
ratios = babies.with_column('Ratio BW/GD', birth_weight / gestational_days)
ratios

In [None]:
# Histogram of column 2, the 'Ratio BW/GD' column.
ratios.hist(2)

In [None]:
# Median of the 'Ratio BW/GD' column (column 2) in the sample.
np.median(ratios.column(2))

In [None]:
# 5000 bootstrap medians of column 2 ('Ratio BW/GD') of ratios.
resampled_medians = bootstrap_median(ratios, 2, 5000)

In [None]:
# Endpoints of the middle 95% of the bootstrap medians.
lower_95 = percentile(2.5, resampled_medians)
upper_95 = percentile(97.5, resampled_medians)
interval_95 = make_array(lower_95, upper_95)

# Histogram of the bootstrap medians with the interval drawn in gold along
# the horizontal axis.
bootstrap_table = Table().with_column('Resampled median', resampled_medians)
bootstrap_table.hist(0)
plots.plot(interval_95, [0, 0], color='gold', lw=8)

print('Approximate 95% Bootstrap Confidence Interval for the Population Median')
print(np.round(interval_95, 4))

In [None]:
# Middle 80% of the bootstrap medians: cut 10% off each tail, i.e. take the
# 10th and 90th percentiles.
interval_80 = make_array(
    percentile(10, resampled_medians),
    percentile(90, resampled_medians)
)

# Histogram of the bootstrap medians with the 80% interval drawn in gold
# along the horizontal axis.
Table().with_column('Resampled median', resampled_medians).hist(0)
plots.plot(interval_80, [0, 0], color='gold', lw=8)
print('Approximate 80% Bootstrap Confidence Interval for the Population Median')
print(np.round(interval_80, 4))

In [None]:
def bootstrap_mean(sample_from_population, label, repetitions):
    """Return a list of means, one per bootstrap resample.

    Parameters:
        sample_from_population: table-like object; only its ``sample()`` and
            ``column(label)`` methods are used.
        label: column label or index forwarded to ``column``.
        repetitions: number of resamples to draw.

    Returns:
        A Python list holding ``np.mean`` of the chosen column for each
        resample, in the order the resamples were drawn.
    """
    resampled_means = []
    for _ in np.arange(repetitions):
        # Each call to sample() draws a fresh resample of the original sample.
        resample = sample_from_population.sample()
        resampled_means.append(np.mean(resample.column(label)))
    return resampled_means

In [None]:
def bootstrap_ci_mean(sample_from_population, label, repetitions):
    """Plot and print an approximate 95% bootstrap interval for a mean.

    Parameters:
        sample_from_population: table passed through to ``bootstrap_mean``.
        label: column label or index passed through to ``bootstrap_mean``.
        repetitions: number of bootstrap resamples to draw.

    Draws a histogram of the resampled means with the interval marked in
    gold, then prints the interval rounded to 3 decimals. Returns None.
    """
    resampled_means = bootstrap_mean(sample_from_population, label, repetitions)

    # Endpoints of the middle 95% of the resampled means.
    lower_end = percentile(2.5, resampled_means)
    upper_end = percentile(97.5, resampled_means)
    interval_95 = make_array(lower_end, upper_end)

    means_table = Table().with_column('Resampled mean', resampled_means)
    means_table.hist(0)
    plots.plot(interval_95, [0, 0], color='gold', lw=8)

    print('Approximate 95% Bootstrap Confidence Interval for Population Mean:')
    print(np.round(interval_95, 3))

In [None]:
# 95% bootstrap interval for the mean of 'Maternal Age', from 5000 resamples.
bootstrap_ci_mean(births, 'Maternal Age', 5000)

In [None]:
# 95% bootstrap interval for the mean of 'Birth Weight', from 5000 resamples.
bootstrap_ci_mean(births, 'Birth Weight', 5000)

In [None]:
# NOTE(review): bare Ellipsis placeholder; it evaluates to `Ellipsis` and does
# nothing else. Presumably left blank to be filled in; confirm or remove.
...

In [None]:
# 95% bootstrap interval for the mean of 'Maternal Smoker', from 5000 resamples.
# NOTE(review): presumably a True/False column, in which case the mean is the
# proportion of smokers; confirm against baby.csv.
bootstrap_ci_mean(births, 'Maternal Smoker', 5000)