In [None]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

## Bootstrap: San Francisco City Salaries

The bootstrap lets us estimate the variability of an estimate.  In this case, we are estimating the median salary of all San Francisco employees from a random sample of 300 employees: the median salary of those 300 is our estimate of the population median.  We want to know roughly how much error this estimate will typically have.

In [None]:
# Population: rows of san_francisco_2015.csv whose 'Total Compensation' is
# above 10*40*52 = 20,800.
# NOTE(review): presumably $10/hour x 40 hours/week x 52 weeks, i.e. a rough
# full-time threshold -- confirm the intent of this cutoff.
sf_pop = (
    Table.read_table('san_francisco_2015.csv')
    .where('Total Compensation', are.above(10*40*52))
)

# The parameter we normally would not know: the population median.
pop_median = percentile(50, sf_pop.column('Total Compensation'))
print("Population median is $", pop_median)

# One random sample of 300 distinct rows; its median is our estimate.
our_sample = sf_pop.sample(300, with_replacement=False)
sample_median = percentile(50, our_sample.column('Total Compensation'))
print("Sample median (our estimate of the population median) is $", sample_median)

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    Resamples the global ``our_sample`` table with ``Table.sample()`` using
    its default arguments (per the datascience documentation: as many rows as
    the table has, drawn with replacement) and returns the 50th percentile of
    the resampled 'Total Compensation' column.
    """
    resampled_compensation = our_sample.sample().column('Total Compensation')
    return percentile(50, resampled_compensation)

In [None]:
# Simulate 1000 bootstrap medians, one per call to one_bootstrap_median().
# The results are gathered in a list and converted to an array once, because
# np.append returns a full copy of the array on every call, which makes a
# growing-array loop do quadratic work.  dtype=float keeps the result a float
# array regardless of the column's numeric type.
bootstrap_medians = np.array(
    [one_bootstrap_median() for _ in range(1000)],
    dtype=float,
)

In [None]:
# Histogram bin edges: 90000, 92500, ..., 125000 (bins of width 2500).
# med_bins is reused by the confidence-interval plot further down.
med_bins = np.arange(90000, 125001, 2500)
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Mark the population median (red) and the sample median (blue) on the same
# axes, at y = -1e-6 so the dots sit just below the histogram's baseline.
plots.scatter(pop_median, -1e-6, color="red");
plots.scatter(sample_median, -1e-6, color="blue");

## Calculating Confidence Intervals with the Bootstrap

The confidence interval is the interval spanning the middle 95% of the bootstrap medians, that is, from their 2.5th percentile to their 97.5th percentile.  The interval will be shown in yellow, the sample median (our estimate) in blue, and the true population median (the parameter) in red.

In [None]:
# Endpoints of the middle 95% of the bootstrap medians: 2.5% of the medians
# are cut off on each side.
left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

# Same histogram as before (bins defined by med_bins in an earlier cell).
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Draw the interval as a gold segment at y = -1e-6; zorder=1 keeps it under
# the red (population median) and blue (sample median) dots drawn with zorder=2.
plots.plot([left, right], [-1e-6,-1e-6], color="gold", lw=3, zorder=1);
plots.scatter(pop_median, -1e-6, color="red", zorder=2);
plots.scatter(sample_median, -1e-6, color="blue", zorder=2);

## Confidence Interval for the Mean Maternal Age

In [None]:
# Load the births table from baby.csv and preview its first 5 rows.
births = Table.read_table('baby.csv')
births.show(5)

In [None]:
# Histogram of the 'Maternal Age' column.
births.hist('Maternal Age')

In [None]:
# Mean of 'Maternal Age' over every row of births.
# NOTE(review): one_bootstrap_mean() resamples this same births table, so the
# value named "pop" here is the mean of the data being resampled -- confirm
# that births is meant to play the role of the population.
pop_mean_age = np.mean(births.column('Maternal Age'))
pop_mean_age

In [None]:
def one_bootstrap_mean():
    """Return the mean 'Maternal Age' of one bootstrap resample of births.

    Resamples the global ``births`` table with ``Table.sample()`` using its
    default arguments (per the datascience documentation: as many rows as the
    table has, drawn with replacement) and averages the resampled column.
    """
    resampled_ages = births.sample().column('Maternal Age')
    return np.mean(resampled_ages)

In [None]:
# Simulate 1000 bootstrap means, one per call to one_bootstrap_mean().
# The results are gathered in a list and converted to an array once, because
# np.append returns a full copy of the array on every call, which makes a
# growing-array loop do quadratic work.
bootstrap_means = np.array(
    [one_bootstrap_mean() for _ in range(1000)],
    dtype=float,
)

# Endpoints of the middle 95% of the bootstrap means.  Note that this
# reassigns the names left and right.
left = percentile(2.5, bootstrap_means)
right = percentile(97.5, bootstrap_means)

In [None]:
# Histogram of the bootstrap means.
Table().with_column('Bootstrap means', bootstrap_means).hist()
# Gold segment from left to right at y = -1e-4 marks the interval; the blue
# dot (drawn on top via zorder=2) marks pop_mean_age.
plots.plot([left,right], [-1e-4, -1e-4], color="gold", lw=3, zorder=1);
plots.scatter(pop_mean_age, -1e-4, color="blue", zorder=2);

## Average (Mean) ##

In [None]:
# Small data set reused by the average and standard-deviation examples below.
values = make_array(2, 3, 3, 9)

In [None]:
# Average from the definition: the sum of the values divided by their count.
sum(values)/len(values)

In [None]:
# The same average computed with np.average.
np.average(values)

In [None]:
# np.mean is another way to compute the average.
np.mean(values)

In [None]:
# Written out by hand: 17/4 = 4.25.
(2 + 3 + 3 + 9)/4

In [None]:
# The same average as a weighted sum: each distinct value times the
# proportion of entries equal to it (2 -> 1/4, 3 -> 2/4, 9 -> 1/4).
2*(1/4) + 3*(2/4) + 9*(1/4)

In [None]:
# One-column table ('value') holding the array, so it can be plotted.
values_table = Table().with_columns('value', values)
values_table

In [None]:
# Bin edges 0.5, 1.5, ..., 10.5: each integer from 1 to 10 sits at the center
# of its own unit-wide bin.  bins_for_display is reused in a later cell.
bins_for_display = np.arange(0.5, 10.6, 1)
# Column index 0 is the table's only column, 'value'.
values_table.hist(0, bins = bins_for_display)

In [None]:
## Make array of 10 2s, 20 3s, and 10 9s

# np.repeat states each count explicitly, so the 10/20/10 split can be checked
# at a glance instead of by counting 40 hand-typed literals.
new_vals = np.repeat([2, 3, 9], [10, 20, 10])

In [None]:
# Histogram of new_vals with the same bins as before; 2, 3 and 9 occur in the
# same proportions (1/4, 2/4, 1/4) as in values.
Table().with_column('value', new_vals).hist(bins = bins_for_display)

In [None]:
# Average of the original four values, for comparison with new_vals.
np.average(values)

In [None]:
# new_vals has the same proportions of 2s, 3s and 9s, so its average is the
# same: 170/40 = 4.25.
np.average(new_vals)

### Discussion Question

In [None]:
# Load the NBA table from nba2013.csv and display it.
nba = Table.read_table('nba2013.csv')
nba

In [None]:
# Unit-wide bins with edges 65.5, 66.5, ..., 89.5, so each whole-number
# height from 66 to 89 sits at the center of its own bin.
nba.hist('Height', bins=np.arange(65.5, 90.5))

In [None]:
# Median (50th percentile) of the 'Height' column.
heights = nba.column('Height')
percentile(50, heights)

In [None]:
# Mean height, for comparison with the median computed above.
np.average(heights)

## Standard Deviation ##

In [None]:
# Start the worked SD example: a one-column table of the same values array.
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# Average of the 'Value' column (column index 0).
average_value = np.average(sd_table.column(0))
average_value

In [None]:
# Deviation from average = value - average, computed elementwise.
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

In [None]:
# The positive and negative deviations cancel: for these values
# (-2.25, -1.25, -1.25, 4.75) the sum is 0.
sum(deviations)

In [None]:
# Square each deviation so that positive and negative deviations no longer
# cancel when averaged.
sd_table = sd_table.with_columns('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Variance of the data: the mean of the squared deviations from the average

variance = np.mean(sd_table.column('Squared Deviation'))
variance

In [None]:
# Standard Deviation (SD) is the square root of the variance;
# taking the root returns to the original units of the values.

sd = variance ** 0.5
sd

In [None]:
# np.std computes the SD directly; with its default arguments it uses the
# same root-mean-square-deviation definition as the steps above.
np.std(values)