In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn below.
plots.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Percentiles

In [None]:
# Five unsorted values used as the example collection for percentile().
v = [1, 7, 3, 9, 5]
v

In [152]:
# Tabulate percentile(p, v) for a handful of percents p. The neighbouring
# percents 39, 40 and 41 show where the result steps from one element of v
# to the next.
Table(['Percent', 'Percentile']).with_rows(
    [[p, percentile(p, v)] for p in (1, 10, 20, 39, 40, 41, 50)]
)

Percent,Percentile
1,1
10,1
20,1
39,3
40,3
41,5
50,5


## Estimation

In [None]:
# Read the CSV and keep only the columns at positions 3, 11 and 21.
sf = Table.read_table('san_francisco_2015.csv').select(3, 11, 21)
# Attach a display formatter to the third kept column (index 2).
# NOTE(review): NumberFormatter(0) presumably means zero decimal places — confirm.
sf.set_format(2, NumberFormatter(0))
# Keep only the rows whose column-2 value is above 10000.
sf = sf.where(2, are.above(10000))
# Preview the first 3 rows.
sf.show(3)

In [None]:
# Rows of sf ordered by column 2 (default order, i.e. not descending).
sf.sort(2)

In [None]:
# Rows of sf ordered by column 2, largest values first.
sf.sort(2, descending=True)

In [None]:
# Bin edges 0, 25000, 50000, ... up to but not including 700000.
comp_bins = np.arange(0, 700000, 25000)
# Histogram of column 2 over those bins, with the unit given as 'dollar'.
sf.hist(2, bins=comp_bins, unit='dollar')

In [None]:
# Draw 200 rows from sf without replacement, so no row is chosen twice.
sample_from_population = sf.sample(200, with_replacement=False)
# Preview the first 3 sampled rows.
sample_from_population.show(3)

In [None]:
# Median of column 2 in the 200-row sample held in `sample_from_population`.
# The name `sample` is only assigned later, inside the "Sample variability"
# loop, so using it here would raise NameError on a fresh kernel.
np.median(sample_from_population.column(2))

In [None]:
# 50th percentile of column 2 in the 200-row sample held in
# `sample_from_population`, for comparison with np.median on the same data.
# The name `sample` is only assigned later, inside the "Sample variability"
# loop, so using it here would raise NameError on a fresh kernel.
percentile(50, sample_from_population.column(2))

## Aside: lists and append

In [None]:
# list.append mutates the list in place: after the call, s holds three items.
s = [2, 3]
s.append(4)
s

In [None]:
a = make_array(2, 3)
# The list-style call is left commented out as part of the demonstration.
# NOTE(review): presumably because arrays have no .append method — confirm.
# a.append(4)
# np.append returns a new array rather than modifying its argument. The result
# is not stored here, so `a` displayed on the last line is unchanged.
np.append(a, 4)
a

## Sample variability

In [None]:
# Repeat the sampling process 100 times, recording the median of each sample.
medians = []
repetitions = np.arange(100)
for i in repetitions:
    # Fresh sample of 200 rows from sf, drawn without replacement.
    sample = sf.sample(200, with_replacement=False)
    medians.append(np.median(sample.column(2)))

# Scatter plot with the repetition index 'i' (column 0) on the horizontal axis.
Table().with_columns('i', repetitions, 'median', medians).scatter(0)

In [None]:
# Histogram of the values collected in `medians`.
Table().with_columns('median', medians).hist(0)

In [None]:
# 2.5th percentile of `medians`: lower end of the middle 95% of the values.
percentile(2.5, medians)

In [None]:
# 97.5th percentile of `medians`: upper end of the middle 95% of the values.
percentile(97.5, medians)

In [None]:
# Report the 2.5th and 97.5th percentiles of `medians` in one sentence.
print("95% of sample medians were between", 
      percentile(2.5, medians),
      "and",
      percentile(97.5, medians))

## Bootstrap

In [None]:
# Display the single sample that the bootstrap cells below work from.
sample_from_population # From now on, this is all we have.

In [None]:
# Median of column 2 in the sample.
np.median(sample_from_population.column(2))

In [None]:
# Resample the sample itself. sample() is called with no arguments, so the
# library defaults apply.
# NOTE(review): in the datascience library the defaults are presumably "as many
# rows as the table has, drawn with replacement" — confirm against Table.sample docs.
resample_1 = sample_from_population.sample()
resample_1

In [None]:
# Median of column 2 in the first resample.
np.median(resample_1.column(2))

In [None]:
# A second, independent resample of the same sample, and its column-2 median.
resample_2 = sample_from_population.sample()
np.median(resample_2.column(2))

In [None]:
# Bootstrap: resample the sample 1000 times and record each resample's median.
resampled_medians = []
for i in np.arange(1000):
    # sample() with no arguments uses the library defaults for size/replacement.
    resample = sample_from_population.sample()
    median = np.median(resample.column(2))
    resampled_medians.append(median)
    
# Histogram of the 1000 resampled medians.
Table().with_column('Resampled median', resampled_medians).hist(0)

## Confidence Intervals

In [None]:
# Report the 2.5th and 97.5th percentiles of `resampled_medians` in one sentence.
print("95% of resampled medians were between", 
      percentile(2.5, resampled_medians),
      "and",
      percentile(97.5, resampled_medians))

In [None]:
# Endpoints of the middle 95% of the resampled medians: [2.5th, 97.5th] percentile.
interval_95 = [percentile(p, resampled_medians) for p in (2.5, 97.5)]
# Median of column 2 over the whole of sf, for comparison with the interval.
pop_median = np.median(sf.column(2))

# Histogram first, then overlay the interval as a gold segment on the
# horizontal axis and the population median as a red dot.
Table().with_column('Resampled median', resampled_medians).hist(0)
plots.plot(interval_95, [0, 0], color='gold', lw=5)
plots.scatter(pop_median, 0, color='red', s=400)

In [None]:
# Draw a fresh sample of 200 rows from sf. with_replacement=False is passed
# explicitly so that no row is chosen twice; calling sample(200) alone falls
# back to the library default, which samples with replacement.
sample_from_population = sf.sample(200, with_replacement=False)
print('Sample median:', np.median(sample_from_population.column(2)))

In [None]:
# Bootstrap the current sample: 1000 resamples, keeping each resample's median.
resampled_medians = []
for i in np.arange(1000):
    resample = sample_from_population.sample()
    median = np.median(resample.column(2))
    resampled_medians.append(median)

# Endpoints of the middle 95% of the resampled medians: [2.5th, 97.5th] percentile.
interval_95 = [percentile(p, resampled_medians) for p in (2.5, 97.5)]
# Median of column 2 over the whole of sf, for comparison with the interval.
pop_median = np.median(sf.column(2))

# Histogram first, then overlay the interval as a gold segment on the
# horizontal axis and the population median as a red dot.
Table().with_column('Resampled median', resampled_medians).hist(0)
plots.plot(interval_95, [0, 0], color='gold', lw=5)
plots.scatter(pop_median, 0, color='red', s=400)

In [None]:
# Repeat the whole procedure 100 times: draw a sample from sf, bootstrap a
# 95% interval for the median from that sample, and record the interval as a
# row of `intervals`.
intervals = Table(['Lower', 'Upper'])

for j in np.arange(100):
    # New sample of 200 rows drawn without replacement. Calling sample(200)
    # alone would use the library default of sampling with replacement, so
    # the flag is passed explicitly.
    sample_from_population = sf.sample(200, with_replacement=False)
    resampled_medians = []
    for i in np.arange(1000):
        # Resample the sample itself, using the library defaults.
        resample = sample_from_population.sample()
        median = np.median(resample.column(2))
        resampled_medians.append(median)

    # Endpoints of the middle 95% of this round's resampled medians.
    interval_95 = [percentile(2.5, resampled_medians),
                   percentile(97.5, resampled_medians)]

    # Add the [lower, upper] pair as a new row of `intervals`.
    intervals.append(interval_95)
    # Progress indicator: 100 rounds of 1000 resamples takes a while.
    print(j)

In [None]:
# Median of column 2 over the whole of sf, the value the intervals try to capture.
truth = np.median(sf.column(2))
# Keep the intervals that contain it: Lower not above truth and Upper not below truth.
correct = intervals.where('Lower', are.not_above(truth)).where('Upper', are.not_below(truth))
# Number of intervals that contain the population median.
correct.num_rows