BerkeleyX: Data8.2x

Foundations of Data Science: Inferential Thinking by Resampling

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lec 10.1 Percentiles

In [None]:
# percentile: the smallest element in a set that is at least as large as p% of the elements
v = [1, 7, 3, 9, 5]
print(v)
# 25th, 50th, 80th and 99th percentiles of v, printed space-separated on one line
print(*(percentile(p, v) for p in (25, 50, 80, 99)))

Lec 10.2 Estimation

In [None]:
# Load the San Francisco 2015 CSV and keep three of its columns, chosen by position.
sf_csv = '../../data/san_francisco_2015.csv'
sf = Table.read_table(sf_csv)
sf = sf.select(3, 11, 21)

# Attach the display format first, then filter (same order as before:
# the formatter is set on the unfiltered table).
sf.set_format('Total Compensation', NumberFormatter(0))

# Keep only rows whose total compensation is above 10000.
sf = sf.where('Total Compensation', are.above(10000))

# Preview 3 rows.
sf.show(3)

In [None]:
# Display sf ordered by 'Total Compensation'.
# NOTE(review): no descending flag is passed, so this is presumably lowest-first — confirm.
sf.sort('Total Compensation')

In [None]:
# Display sf ordered by 'Total Compensation' with descending=True (highest values first).
sf.sort('Total Compensation', descending=True)

In [None]:
# Bin edges every 25000, starting at 0 and stopping before 700000.
comp_bins = np.arange(0, 700000, 25000)
# Histogram of 'Total Compensation' over those bins.
# NOTE(review): passing normed=None together with density=True looks like a
# version-compatibility workaround for the plotting stack — confirm it is still
# accepted by the installed datascience/matplotlib versions.
sf.hist('Total Compensation', bins=comp_bins, unit="dollar", normed=None, density=True)

In [None]:
# 50th percentile of 'Total Compensation' over the whole sf table;
# a percentile is less influenced by outliers
percentile(50, sf.column('Total Compensation'))

In [None]:
# Draw 200 rows from sf without replacement, then preview 3 rows of the sample.
sample_from_population = sf.sample(200, with_replacement=False)
sample_from_population.show(3)

In [None]:
# 50th percentile (a.k.a. the median) of the sample's 'Total Compensation'
percentile(50, sample_from_population.column('Total Compensation'))

In [None]:
# Median of 'Total Compensation' over the whole sf table, computed with NumPy.
np.median(sf.column('Total Compensation'))

In [None]:
# Median of 'Total Compensation' in the sample, computed with NumPy.
# NOTE(review): np.median averages the two middle values when the length is even,
# so it need not equal percentile(50, ...) exactly — presumably close; confirm.
np.median(sample_from_population.column('Total Compensation'))

Lec 10.3 Estimate Variability

In [None]:
# Draw 100 separate samples of 200 rows each (without replacement) from sf
# and record the median 'Total Compensation' of every sample.
repetitions = np.arange(100)
medians = [
    np.median(sf.sample(200, with_replacement=False).column('Total Compensation'))
    for trial in repetitions
]

# One point per trial: the vertical spread shows how much the estimate varies.
medians_by_trial = Table().with_columns('trial', repetitions, 'median', medians)
medians_by_trial.scatter('trial')

In [None]:
# Histogram of the values currently held in `medians`
# (column index 0 of a one-column table built just for plotting).
Table().with_column('medians', medians).hist(0, normed=None, density=True)

Lec 10.4 The Bootstrap

In [None]:
# Draw one sample of 200 rows, without replacement, from the actual population (sf).
# The bootstrap cells that follow resample from this table.
sample_from_population = sf.sample(200, with_replacement=False)
sample_from_population.show(3)

In [None]:
# Median of 'Total Compensation' in the sample held in sample_from_population
np.median(sample_from_population.column('Total Compensation'))

In [None]:
# Resample from the sample itself (with replacement) - this gives a different median.
# NOTE(review): sample() is called with no arguments; presumably it draws as many
# rows as the table has, with replacement — confirm against the datascience defaults.
resample = sample_from_population.sample()
np.median(resample.column('Total Compensation'))

In [None]:
# Bootstrap: instead of drawing new samples from the actual population,
# repeatedly resample from the single sample we already have
# (sample_from_population) and record the median of each resample.
medians = []

for i in np.arange(1000):
    # sample() with no arguments: a fresh resample of sample_from_population
    resample = sample_from_population.sample()
    median = np.median(resample.column('Total Compensation'))
    medians.append(median)

# Histogram of the 1000 resampled medians.
# The column label (shown on the plot) was misspelled 'Reampled median'.
Table().with_column('Resampled median', medians).hist(normed=None, density=True)

In [None]:
# 95% confidence interval: the 2.5th and 97.5th percentiles of the values in
# `medians`, i.e. the endpoints of their middle 95%
percentile(2.5, medians), percentile(97.5, medians)

In [None]:
# 99% confidence interval: the 0.5th and 99.5th percentiles of the values in
# `medians`, i.e. the endpoints of their middle 99%
percentile(0.5, medians), percentile(99.5, medians)

In [None]:
# Check how good the method is: repeat the whole bootstrap procedure on 100
# separately drawn samples and collect each resulting 95% interval.
def bootstrap_medians(sample_table, num_resamples):
    """Return a list with the median 'Total Compensation' of each resample.

    Every resample is obtained by calling ``sample_table.sample()`` with no
    arguments; ``num_resamples`` of them are drawn, one after another.
    """
    return [
        np.median(sample_table.sample().column('Total Compensation'))
        for k in np.arange(num_resamples)
    ]


intervals = Table(['Lower', 'Upper'])

for j in np.arange(100):
    # New sample of 200 rows, drawn without replacement from the population.
    # Note: this rebinds sample_from_population and medians on every pass.
    sample_from_population = sf.sample(200, with_replacement=False)
    medians = bootstrap_medians(sample_from_population, 1000)

    # [lower, upper] endpoints of the middle 95% of the bootstrap medians
    interval_95 = [percentile(bound, medians) for bound in (2.5, 97.5)]

    intervals.append(interval_95)

In [None]:
# Count the intervals that contain the population median; with 100 intervals
# at the 95% level this should be around 95.
truth = np.median(sf.column('Total Compensation'))

# Keep intervals whose lower end is not above the truth ...
lower_ok = intervals.where('Lower', are.not_above(truth))
# ... and, among those, the ones whose upper end is not below it.
correct = lower_ok.where('Upper', are.not_below(truth))

correct.num_rows

In [None]:
# Intervals that miss on the high side: their lower end is above the population median.
intervals.where('Lower', are.above(truth))

In [None]:
# Intervals that miss on the low side: their upper end is below the population median.
intervals.where('Upper', are.below(truth))