<h1> Lecture 21

Data Science 8, Summer 2021 </h1>

<h3>
<b>
<ul>
<li>Bootstrap</li><br>
    
<li>Confidence Intervals  </li><br>

<li>Interpretation of Confidence Intervals  </li>
</ul>
</b>
</h3>

In [None]:
from datascience import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
plots.rcParams["patch.force_edgecolor"] = True

#The following allows porting images into a Markdown window
#Syntax: ![title](image_name.png)
from IPython.display import Image

## Bootstrap: San Francisco City Salaries

The bootstrap lets us estimate the variability in an estimate.  In this case, we are estimating the median salary of all San Francisco employees, based on a sample of 300 employees (namely, the median salary of those 300 is our estimate for the population median); and we want to know about how much error this estimate will typically have.

<h2>City of San Francisco Employees: Sample Median</h2>

In [None]:
# Read the CSV file into a Table; the bare name on the last line displays it.
sf = Table.read_table('san_francisco_2015.csv')
sf

<h3>Declutter: Remove extraneous columns</h3>

In [None]:
# Keep only the 'Job' and 'Total Compensation' columns; `sf` is rebound to the
# narrower table.
sf=sf.select('Job','Total Compensation')
sf

<h4>Assume 
    <ul>
        <li> a minimum hourly wage of $10/hour in 2015.</li><br>
        <li> at least half-time employee (i.e., works at least 20 hours/week)</li>
    </ul>
</h4>

In [None]:
# Assumed wage floor, in dollars per hour.
min_wage_per_hour = 10
# Half-time employment: 20 hours every week.
hours_per_week = 20
weeks_per_year = 52

# Smallest annual pay consistent with the assumptions above.
min_salary = weeks_per_year * hours_per_week * min_wage_per_hour
min_salary

<h4>Keep only the rows where 'Total Compensation' is at least <tt>min_salary</tt>:</h4>

In [None]:
# Keep employees whose total compensation is at least `min_salary`
# (inclusive, matching the stated assumption), then order the rows from the
# highest to the lowest total compensation.
sf = (
    sf.where('Total Compensation', are.above_or_equal_to(min_salary))
      .sort('Total Compensation', descending=True)
)
sf

<h4>Population Size:</h4>

In [None]:
# Number of rows in `sf` at this point, i.e. after the minimum-salary filter.
sf.num_rows

<h4>Median 'Total Compensation' of the entire data set:</h4>

In [None]:
# 50th percentile of 'Total Compensation' over every row of `sf`; later cells
# compare sample and bootstrap medians against this value.
pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

<h4>Minimum 'Total Compensation' of the entire data set:</h4>

In [None]:
# Smallest 'Total Compensation' value among the rows of `sf`.
pop_min = sf.column('Total Compensation').min()
pop_min

<h4>Maximum 'Total Compensation' of the entire data set:</h4>

In [None]:
# Largest 'Total Compensation' value among the rows of `sf`.  Uses the array's
# own vectorized .max(), mirroring the .min() call used for the minimum,
# instead of looping over the column with the Python builtin.
pop_max = sf.column('Total Compensation').max()
pop_max

<h4>Set the histogram bins.</h4>

In [None]:
# Histogram bin edges: 25,000-wide bins starting at 0.  The stop value
# 700000 is excluded by np.arange, so the last edge is 675000.
bin_width = 25000
sf_bins = np.arange(0, 700000, bin_width)
sf_bins

<h4>Plot the Population Distribution</h4>

In [None]:
# Histogram of every employee's total compensation, drawn with the shared
# `sf_bins` edges, followed by the population median for reference.
sf.hist('Total Compensation', bins=sf_bins)
plots.title('Population Distribution');
print("Population Median =", pop_median)

<h4>Sample the population: use the sample median as an estimate of the population median.</h4> 

In [None]:
# Number of employees drawn in each random sample.
sample_size = 300
sample_size

In [None]:
# Draw one random sample of `sample_size` employees (no row chosen twice) and
# use its median compensation as the estimate of the population median.
our_sample = sf.sample(sample_size, with_replacement=False)
sample_compensation = our_sample.column('Total Compensation')
sample_median = percentile(50, sample_compensation)

print("Population median: $", pop_median)
print("Sample median: $", sample_median)

In [None]:
def generate_sample_median(sample_size):
    """Return the median 'Total Compensation' of one fresh random sample.

    Draws `sample_size` rows from the notebook-level table `sf` without
    replacement and returns the 50th percentile of their compensation.
    """
    drawn_rows = sf.sample(sample_size, with_replacement=False)
    compensation = drawn_rows.column('Total Compensation')
    return percentile(50, compensation)

<h4>Now run a large number of trials.</h4> 

In [None]:
num_simulations = 1000
sample_size = 300

# Draw `num_simulations` independent samples from the population and record
# the median of each one.  The medians are collected in a list and converted
# to an array once; calling np.append inside the loop would copy the whole
# array on every iteration.
sample_medians = np.array(
    [generate_sample_median(sample_size) for _ in range(num_simulations)],
    dtype=float,  # keep a float array even if every median is a whole number
)

<h4>Population median and our sample median (for reference):</h4>

In [None]:
# Reference values: the median of the full population table and the median of
# the single sample drawn earlier.
print("Population median: $", pop_median)
print("Sample median: $", sample_median)

<h4>Minimum Sample Median</h4>

In [None]:
# Smallest of the simulated sample medians.
min(sample_medians)

<h4>Maximum Sample Median</h4>

In [None]:
# Largest of the simulated sample medians.
max(sample_medians)

<h4>Create the bins, informed by min and max median values</h4>

In [None]:
# Bin edges every 2,500 from 90,000 up to and including 125,000 (the stop
# value 125001 itself is excluded by np.arange).
median_bins = np.arange(90000, 125001, 2500)

In [None]:
# Bin edges every 2,500 from 90,000 through 125,000, chosen to cover the
# range of the simulated medians
median_bins = np.arange(90000, 125001, 2500)

# Put the simulated medians in a one-column table and draw its histogram
sample_medians_table = Table().with_column('Sample Medians', sample_medians)
sample_medians_table.hist(bins=median_bins)

# Mark the population median with a red dot just below the x-axis
plots.scatter(pop_median, -1e-6, color="red");

# Bootstrap

<h4> Take One Bootstrap Resample from the Sample, and plot the histogram.<br>
    
Run the following cell several times.</h4>

In [None]:
# Each bootstrap resample has 300 rows, the same size as our sample
sample_size = 300

# Resample from our_sample (not from the population), allowing repeated rows
boot_sample = our_sample.sample(sample_size, with_replacement=True)
boot_sample.hist('Total Compensation', bins=sf_bins)
plots.title('One Bootstrap Sample');

# Median of this particular resample, shown next to the two reference medians
boot_sample_median = percentile(50, boot_sample.column('Total Compensation'))
print("Population Median =       ", pop_median)
print("Our Sample Median =       ", sample_median)
print("Bootstrap Sample Median = ", boot_sample_median)

<h4>Define function <tt>one_bootstrap_median</tt> that takes a resample from the sample.</h4>

In [None]:
def one_bootstrap_median(sample_size):
    """Return the median 'Total Compensation' of one bootstrap resample.

    The resample consists of `sample_size` rows drawn with replacement from
    the notebook-level table `our_sample`.
    """
    resample_compensation = our_sample.sample(
        sample_size, with_replacement=True
    ).column('Total Compensation')
    return percentile(50, resample_compensation)

<h4>Run a large number of trials&mdash;take many resamples.</h4>

In [None]:
num_simulations = 1000

# Take `num_simulations` bootstrap resamples of our sample and record the
# median of each one.  The medians are collected in a list and converted to
# an array once; calling np.append inside the loop would copy the whole array
# on every iteration.
bootstrap_medians = np.array(
    [one_bootstrap_median(sample_size) for _ in range(num_simulations)],
    dtype=float,  # keep a float array even if every median is a whole number
)

<h4>Median of the Bootstrap Medians</h4>

In [None]:
# Wrap the bootstrap medians in a one-column table so they can be plotted
bootstrap_medians_table = Table().with_column('Bootstrap Medians', bootstrap_medians)

# 50th percentile of the bootstrap medians themselves
boot_median = percentile(50, bootstrap_medians_table.column('Bootstrap Medians'))

In [None]:
# Histogram of the bootstrap medians, drawn with the `median_bins` edges.
bootstrap_medians_table.hist('Bootstrap Medians', bins=median_bins)

# Red dot: population median; blue dot: median of our sample (both at y = 0).
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(sample_median, 0, color="blue", zorder=2);
# Orange dot: median of the bootstrap medians, placed lower at y = -4e-6.
plots.scatter(boot_median, -4e-6, color="orange", zorder=3);

In [None]:
# Questions
# Slides

## Calculating Confidence Intervals with the Bootstrap

The confidence interval is the interval spanning the middle 95% of the bootstrap medians (from the 2.5th to the 97.5th percentile).  The interval will be shown in yellow, the sample median (our estimate) in blue, and the true population median (the parameter) in red.

In [None]:
# Endpoints of the middle 95% of the bootstrap medians
left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

# Histogram of the bootstrap medians (default bins)
ci_table = Table().with_column('Bootstrap Medians', bootstrap_medians)
ci_table.hist('Bootstrap Medians')

# Draw the interval (gold) and both medians (red, blue) just below the x-axis
marker_y = -1e-6
plots.plot([left, right], [marker_y, marker_y], color="gold", lw=3, zorder=1);
plots.scatter(pop_median, marker_y, color="red", zorder=2);
plots.scatter(sample_median, marker_y, color="blue", zorder=2);
print("Left End of CI:", left)
print("Right end of CI:", right)

In [None]:
num_replications = 100   # independent samples; one confidence interval each
num_simulations = 1000   # bootstrap resamples taken from each sample

# Interval endpoints are gathered in lists and converted to arrays once, at
# the end, rather than being re-copied by np.append on every iteration.
lower_ends = []
upper_ends = []

for _ in range(num_replications):
    # Fresh sample from the population.  This rebinds the notebook-level
    # `our_sample`, so after this cell it refers to the last sample drawn.
    our_sample = sf.sample(sample_size, with_replacement=False)

    # Bootstrap that sample.  percentile(50, ...) is used so the statistic is
    # the same one used for pop_median and in the other bootstrap cells.
    resample_medians = []
    for _ in range(num_simulations):
        resample = our_sample.sample(sample_size, with_replacement=True)
        resample_medians.append(
            percentile(50, resample.column("Total Compensation"))
        )
    bootstrap_medians = np.array(resample_medians, dtype=float)

    # Middle 95% of this replication's bootstrap medians
    lower = percentile(2.5, bootstrap_medians)
    upper = percentile(97.5, bootstrap_medians)
    lower_ends.append(lower)
    upper_ends.append(upper)

lowers = np.array(lower_ends, dtype=float)
uppers = np.array(upper_ends, dtype=float)

# One horizontal gold segment per replication, with a vertical red line at
# the population median for comparison.
plots.figure(figsize=(8,8));
for row, (interval_lower, interval_upper) in enumerate(zip(lowers, uppers)):
    plots.plot([interval_lower, interval_upper], [row, row], color="gold", lw=2, zorder=1);
plots.plot([pop_median, pop_median], [-10, num_replications], color="red", zorder=2, lw=1);
plots.ylim(0, num_replications);
plots.title("Population Median and Intervals of Estimates");
plots.xlabel("Median (dollars)");
plots.ylabel("Replication");
