In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

In [None]:
# Read the 2015 San Francisco compensation file and keep only the columns at
# indices 3, 11 and 21 (one of which is 'Total Compensation', used below).
sf = Table.read_table('san_francisco_2015.csv')
sf = sf.select(3, 11, 21)

# Display compensation values with zero decimal places.
sf.set_format('Total Compensation', NumberFormatter(0))

# Keep only the rows whose total compensation is above the floor.
compensation_floor = 10000
sf = sf.where('Total Compensation', are.above(compensation_floor))
sf.show(3)

In [None]:
# Histogram of total compensation over the whole population.
# Bin edges run from 0 in steps of 25,000, stopping before 700,000.
bin_width = 25000
bin_stop = 700000
comp_bins = np.arange(0, bin_stop, bin_width)
sf.hist('Total Compensation', bins=comp_bins, unit='dollar')

In [None]:
# Median total compensation over every row of `sf`. Later cells treat this as
# the population parameter that the samples are trying to estimate.
population_compensation = sf.column('Total Compensation')
pop_median = np.median(population_compensation)
pop_median

In [None]:
# Draw a single random sample of 200 rows without replacement, preview it,
# and display the sample's median total compensation.
single_sample_size = 200
sample_from_population = sf.sample(single_sample_size, with_replacement=False)
sample_from_population.show(3)

sample_compensation = sample_from_population.column('Total Compensation')
np.median(sample_compensation)

In [None]:
# Empirical sampling distribution of the median: draw 100 random samples of
# size 200 (without replacement) from the population and record the median
# total compensation of each one.

num_samples = 100
sample_size = 200
repetitions = np.arange(num_samples)

# Collect the medians in a list and convert once at the end. np.append
# returns a fresh copy of the array on every call, so growing an array inside
# the loop does quadratic work for no benefit.
collected_medians = []
for i in repetitions:
    sample = sf.sample(sample_size, with_replacement=False)
    collected_medians.append(np.median(sample.column('Total Compensation')))
sample_medians = np.array(collected_medians)

# Histogram of the sample medians, with the population median marked as a red
# dot on the horizontal axis. The trailing semicolon keeps the matplotlib
# object repr out of the cell output.
Table().with_columns('median', sample_medians).hist()
plots.scatter(pop_median, 0, color='red', s=400);

In [None]:
# Use the bootstrap to estimate a 95% confidence interval for the median
# salary in the population, based on a single sample of size 200.

sample_size = 200
num_resamples = 1000
sample = sf.sample(sample_size, with_replacement=False)

# Each bootstrap resample draws `sample_size` rows from the original sample
# (Table.sample samples with replacement unless told otherwise) and
# contributes one median. Build the array in one step instead of calling
# np.append per iteration, which copies the whole array every time.
resampled_medians = np.array([
    np.median(sample.sample(sample_size).column('Total Compensation'))
    for _ in np.arange(num_resamples)
])

# The middle 95% of the bootstrap medians forms the interval.
lower = percentile(2.5, resampled_medians)
upper = percentile(97.5, resampled_medians)
interval_95 = [lower, upper]

# Histogram of the bootstrap medians with the interval drawn in gold along the
# horizontal axis and the population median as a red dot. The trailing
# semicolon keeps the matplotlib object repr out of the cell output.
Table().with_column('Resampled median', resampled_medians).hist('Resampled median')
plots.plot(interval_95, [0, 0], color='gold', lw=5)
plots.scatter(pop_median, 0, color='red', s=400);

In [None]:
# Calculate a 95% bootstrap confidence interval for the population median
# from each of 100 independent samples of size 200, and report whether each
# interval contains the population median.
#
# Results are left in `all_lower` and `all_upper` (numpy arrays, one entry per
# sample) for the plotting cell that follows.

num_samples = 100
sample_size = 200
num_resamples = 1000

lower_bounds = []
upper_bounds = []

for i in np.arange(num_samples):
    sample = sf.sample(sample_size, with_replacement=False)

    # Bootstrap: resample the sample at its own size (with replacement, the
    # Table.sample default) and take the median of each resample. Built in one
    # step to avoid the per-iteration array copy that np.append performs.
    resampled_medians = np.array([
        np.median(sample.sample(sample_size).column('Total Compensation'))
        for _ in np.arange(num_resamples)
    ])

    lower = percentile(2.5, resampled_medians)
    upper = percentile(97.5, resampled_medians)

    # The interval [lower, upper] is closed, so an endpoint that equals the
    # population median counts as containing it.
    contains_pop_median = lower <= pop_median <= upper
    print('Sample = ',i+1,', Interval Contains Population Median = ',
          contains_pop_median)

    lower_bounds.append(lower)
    upper_bounds.append(upper)

all_lower = np.array(lower_bounds)
all_upper = np.array(upper_bounds)

In [None]:
# Plot every interval as one point: x is its lower bound, y its upper bound.
intervals = Table().with_columns(
    'Lower', all_lower,
    'Upper', all_upper,
)
intervals.scatter('Lower')

# Mark the population median with a red dot at (pop_median, pop_median), plus
# a horizontal segment running left from it and a vertical segment running up
# from it. Points in that upper-left corner have Lower below and Upper above
# the population median.
smallest_lower = min(all_lower)
largest_upper = max(all_upper)
plots.hlines(pop_median, xmin=smallest_lower, xmax=pop_median)
plots.vlines(pop_median, ymin=pop_median, ymax=largest_upper)
plots.scatter(pop_median, pop_median, color='red', s=200)