In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

## Lecture 23 ##

### Percentiles

In [None]:
# Manually compute the 55th percentile of a small six-element array.
x = make_array(43, 20, 51, 7, 28, 34)

In [None]:
# Step 1. Sort the data (ascending order is 7, 20, 28, 34, 43, 51).
np.sort(x)

In [None]:
# Step 2. Figure out where 55th percentile would be.
# (This cell has no code; the position arithmetic appears in the last cell of
# this section.)

In [None]:
# OR: 1 Line of Code -- let the library's percentile function do the work.
percentile(55, x)

In [None]:
# If we tried to compute which element to take...
# 55% of 6 elements is about 3.3, which is not a whole-number position, so a
# rounding rule is needed.
# NOTE(review): datascience's percentile is documented as returning the smallest
# value at least as large as p% of the data, which would be the 4th sorted
# element here -- confirm against the library docs.
55 / 100 * 6

### Sample Median

In [None]:
# Load the table stored in san_francisco_2015.csv (path is relative to the
# notebook's working directory). Later cells rely on its 'Total Compensation'
# column.
sf = Table.read_table('san_francisco_2015.csv')
sf

In [None]:
# Who is making the most money:
# sort by 'Total Compensation', largest first, and show the first 5 rows.
sf.sort('Total Compensation', descending=True).show(5)

In [None]:
# Who is making the least money:
# same sort in ascending order, again showing the first 5 rows.
sf.sort('Total Compensation', descending=False).show(5)

In [None]:
# Keep only rows whose 'Total Compensation' is above a minimum threshold.
# NOTE(review): 10 * 20 * 52 presumably means $10/hour x 20 hours/week x 52
# weeks (a year of half-time work) -- confirm the intended cutoff.
min_salary = 10 * 20 * 52
# This rebinds sf to the filtered table, so every later cell sees only the rows
# above the threshold (the sorted previews in earlier cells were unfiltered).
sf = sf.where('Total Compensation', are.above(min_salary))

In [None]:
# Median 'Total Compensation' of the filtered table. The rest of the notebook
# treats this value as the population parameter being estimated.
pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

In [None]:
# Draw one random sample of 300 rows without replacement. This is the sample
# that the bootstrap cells later resample from. No random seed is set in this
# notebook, so the rows drawn differ from run to run.
our_sample = sf.sample(300, with_replacement=False)
our_sample.show(5)

In [None]:
# Median of our_sample. The value is only displayed here, not stored in a
# variable.
percentile(50, our_sample.column('Total Compensation'))

In [None]:
# Shared histogram bin edges: every 25,000 starting at 0 (np.arange stops
# before 700,000).
sf_bins = np.arange(0, 700000, 25000)
sf.hist('Total Compensation', bins=sf_bins)
# The trailing semicolon suppresses the text repr of the title object.
plots.title('Population Distribution');

In [None]:
# Reuse the population's bins so the two histograms are directly comparable.
our_sample.hist('Total Compensation', bins=sf_bins)
plots.title('Sample Distribution');

### Variability of the Estimate

In [None]:
def generate_sample_median(samp_size):
    """Return the median 'Total Compensation' of a fresh random sample of sf.

    Draws samp_size rows from the table sf without replacement and returns the
    50th percentile of their 'Total Compensation' values. Each call draws a new
    sample, so the result varies from call to call. The sample is local to this
    function and does not touch the notebook-level our_sample.
    """
    sampled_pay = sf.sample(samp_size, with_replacement=False).column('Total Compensation')
    return percentile(50, sampled_pay)

In [None]:
# One simulated sample median. generate_sample_median draws its own fresh
# sample of 300 rows from sf, so this is NOT the median of our_sample.
sample_median = generate_sample_median(300)
sample_median

In [None]:
# Estimation error: signed distance of this sample median from the population
# median (negative means the sample median is below pop_median).
error = sample_median - pop_median
error

### Quantifying Uncertainty

In [None]:
# Simulate 1000 independent samples of 300 rows each and record the median of
# every sample. The array is built in one step because np.append copies the
# whole array on each call, which makes an append loop quadratic.
# dtype=float keeps the float array that appending onto an empty array gives.
sample_medians = np.array(
    [generate_sample_median(300) for _ in range(1000)],
    dtype=float,
)

In [None]:
# Bin edges for median histograms: 90,000 through 125,000 in steps of 2,500
# (the stop value 125001 makes the 125,000 edge inclusive).
med_bins = np.arange(90000, 125001, 2500)
# Empirical distribution of the simulated sample medians.
Table().with_column(
    'Sample Medians', sample_medians
).hist(bins = med_bins)

# Red dot marks the population median. The y value of -1e-6 is just below zero,
# presumably so the marker sits on the horizontal axis -- confirm visually.
plots.scatter(pop_median, -1e-6, color="red");

In [None]:
# Bin edges for the error histogram: -15,000 through 12,500 in steps of 2,500.
err_bins = np.arange(-15000, 12501, 2500)
# Same simulated medians, re-expressed as signed errors from pop_median.
Table().with_column(
    'Errors', sample_medians - pop_median
).hist(bins = err_bins)

# Red dot at 0 marks zero error, i.e. a sample median equal to pop_median.
plots.scatter(0, -1e-6, color="red");

### Bootstrap

In [None]:
# Take a bootstrap (re)sample of size 300, WITH replacement
boot_sample = our_sample.sample(300, with_replacement=True)
boot_sample.hist('Total Compensation', bins=sf_bins)
plots.title('Bootstrap sample');

# The bootstrap resamples our_sample, so the line labelled "Our Sample Median"
# must report our_sample's own median. The notebook-level sample_median comes
# from generate_sample_median, which draws a different random sample.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

print("Population Median =       ", pop_median)
print("Our Sample Median =       ", our_sample_median)
print("Bootstrap Sample Median = ",
      percentile(50, boot_sample.column('Total Compensation')))

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    Resamples the notebook-level table our_sample via Table.sample() with its
    default arguments and returns the 50th percentile of the resampled
    'Total Compensation' column. Each call draws a new resample.

    NOTE(review): the defaults of Table.sample() are documented as "same number
    of rows, with replacement" -- confirm against the datascience docs.
    """
    resampled_pay = our_sample.sample().column('Total Compensation')
    return percentile(50, resampled_pay)

In [None]:
# Collect the medians of 1000 bootstrap resamples. The array is built in one
# step because np.append copies the whole array on each call, which makes an
# append loop quadratic.
# dtype=float keeps the float array that appending onto an empty array gives.
bootstrap_medians = np.array(
    [one_bootstrap_median() for _ in range(1000)],
    dtype=float,
)

In [None]:
# Empirical distribution of the bootstrap medians, on the same bins as the
# sample-median histogram.
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# The blue dot must be the median of our_sample, the sample the bootstrap
# resampled. The notebook-level sample_median comes from a different random
# sample drawn inside generate_sample_median.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Red = population median (parameter); blue = our sample's median (estimate).
plots.scatter(pop_median, -1e-6, color="red");
plots.scatter(our_sample_median, -1e-6, color="blue");

### Confidence Intervals

The 95% confidence interval spans the middle 95% of the bootstrap medians, from their 2.5th percentile to their 97.5th percentile. The interval is shown in yellow, the sample median (our estimate) in blue, and the true population median (the parameter) in red.

In [None]:
# Endpoints of the middle 95% of the bootstrap medians.
left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# The blue dot must be the median of our_sample, the sample the bootstrap
# resampled. The notebook-level sample_median comes from a different random
# sample drawn inside generate_sample_median.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Gold segment = interval, drawn first (zorder=1) so the red (population
# median) and blue (our sample's median) dots stay visible on top of it.
plots.plot([left, right], [-1e-6, -1e-6], color="gold", lw=3, zorder=1);
plots.scatter(pop_median, -1e-6, color="red", zorder=2);
plots.scatter(our_sample_median, -1e-6, color="blue", zorder=2);