In [2]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 23 #

## Percentiles 

Suppose we wanted to manually compute the 55th percentile of the following array:

In [3]:
x = make_array(43, 20, 51, 7, 28, 34)

**Step 1.** To compute percentiles, we first sort the data.

In [4]:
sorted_x = np.sort(x)
sorted_x

array([ 7, 20, 28, 34, 43, 51])

In [21]:
# Pair each sorted element with the percentile it occupies:
# the k-th smallest of n values sits at 100 * k / n percent.
ptbl = Table().with_columns(
    "Percentile", 100 * np.arange(1, len(x) + 1) / len(x),
    "Element", sorted_x,
)
ptbl

Percentile,Element
16.6667,7
33.3333,20
50.0,28
66.6667,34
83.3333,43
100.0,51


**Step 2.** Figure out where the $p^\text{th}$ percentile would be.

In [16]:
# Percentile we want to locate in the sorted array.
p = 55
# ceil(n * p/100) is the 1-based rank of the first element whose percentile
# is at least p; subtracting 1 converts it to a zero-based index.
# Caveat: for p = 0 this expression evaluates to -1, not 0.
ind = int(np.ceil(len(x) * p/100) - 1)
ind

3

In [17]:
sorted_x.item(ind)

34

The above calculation is confusing and brittle (try p=0).  Instead, we should use the `percentile` function.

### Using the Percentile Function

In [18]:
percentile?

[0;31mSignature:[0m [0mpercentile[0m[0;34m([0m[0mp[0m[0;34m,[0m [0marr[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns the pth percentile of the input array (the value that is at
least as great as p% of the values in the array).

If arr is not provided, percentile returns itself curried with p

>>> percentile(74.9, [1, 3, 5, 9])
5
>>> percentile(75, [1, 3, 5, 9])
5
>>> percentile(75.1, [1, 3, 5, 9])
9
>>> f = percentile(75)
>>> f([1, 3, 5, 9])
5
[0;31mFile:[0m      /opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/datascience/util.py
[0;31mType:[0m      function


Recall the percentile table.

In [22]:
ptbl

Percentile,Element
16.6667,7
33.3333,20
50.0,28
66.6667,34
83.3333,43
100.0,51


Let's try a few values.

In [23]:
percentile(50, x)

28

In [24]:
percentile(55, x)

34

In [25]:
percentile(0, x)

7

In [26]:
percentile(100, x)

51

<br><br><br><br>

---
<center> Return to Slides </center>

---

<br><br><br><br>

## Discussion Question

In [27]:
s = make_array(1, 3, 5, 7, 9)

In [28]:
# Percentile occupied by each element of s once sorted:
# the k-th smallest of n values sits at 100 * k / n percent.
Table().with_columns(
    "Percentile", 100 * np.arange(1, len(s) + 1) / len(s),
    "Element", sorted(s),
)

Percentile,Element
20,1
40,3
60,5
80,7
100,9


In [29]:
percentile(10, s) == 0

False

In [30]:
percentile(39, s) == percentile(40, s)

True

In [31]:
percentile(40, s) == percentile(41, s)

False

In [32]:
percentile(50, s) == 5

True

<br><br><br><br>

---
<center> Return to Slides </center>

---

<br><br><br><br>

## Inference: Estimation

In [None]:
sf = Table.read_table('san_francisco_2019.csv')
sf.show(3)

In [None]:
# Who made the most money
sf.sort('Total Compensation', descending=True).show(5)

In [None]:
# Who made the least money
sf.sort('Total Compensation', descending=False).show(5)

In [None]:
# Salary threshold: $15/hr * 20 hr/wk * 50 weeks = $15,000.

min_salary = 15 * 20 * 50
# Keep only the rows whose 'Salary' is above the threshold. `sf` is rebound
# here, so every later cell works with the filtered table.
sf = sf.where('Salary', are.above(min_salary))

In [None]:
# Population Distribution

sf_bins = np.arange(0, 726000, 25000)
sf.hist('Total Compensation', bins=sf_bins)

In [None]:
# An Empirical Distribution

our_sample = sf.sample(400, with_replacement=False)
our_sample.hist('Total Compensation', bins=sf_bins)

In [None]:
# Parameter: Median Total Compensation 

pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

In [None]:
# Estimate: Median of a Sample

percentile(50, our_sample.column('Total Compensation'))

But in the real world we won't be able to keep going back to the population. How can we generate a new random sample *without going back to the population?*

## Variability of the Estimate

In [None]:
def generate_sample_median(samp_size):
    """Return the median 'Total Compensation' of one new random sample.

    Draws `samp_size` rows from the table `sf` without replacement and
    takes the 50th percentile of their 'Total Compensation' values.
    """
    compensations = sf.sample(samp_size, with_replacement=False).column('Total Compensation')
    return percentile(50, compensations)

In [None]:
generate_sample_median(400)

## Quantifying Uncertainty

In [None]:
# Start from an empty array and collect one sample median per repetition.
sample_medians = make_array()

# 1000 repetitions; each one draws a fresh sample of 400 rows from `sf`
# and appends that sample's median to the running array.
for i in np.arange(1000):
    new_median = generate_sample_median(400)
    sample_medians = np.append(sample_medians, new_median)

In [None]:
# Histogram of the simulated sample medians, with bin edges every 2000
# starting at 120000.
med_bins = np.arange(120000, 160000, 2000)
Table().with_column('Sample Medians', sample_medians).hist(bins=med_bins)

# The lower y-limit is slightly below zero, leaving room for the marker drawn
# at y = 0; the red dot marks the population median.
plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color='red');

# Bootstrap

Sample randomly
 - from the original sample
 - with replacement
 - the same number of times as the original sample size

In [None]:
# Default behavior of tbl.sample:
# at random with replacement,
# the same number of times as rows of tbl

bootstrap_sample = our_sample.sample()
bootstrap_sample.hist('Total Compensation', bins=sf_bins)
percentile(50, bootstrap_sample.column('Total Compensation'))

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    The resample is drawn from `our_sample` using `sample()` with its
    default arguments, and the median is taken as the 50th percentile.
    """
    return percentile(50, our_sample.sample().column('Total Compensation'))

In [None]:
one_bootstrap_median()

In [None]:
# Generate the medians of 1000 bootstrap samples
num_repetitions = 1000
# Start empty; each pass appends the median of one new bootstrap resample.
bstrap_medians = make_array()
for i in np.arange(num_repetitions):
    bstrap_medians = np.append(bstrap_medians, one_bootstrap_median())

In [None]:
resampled_medians = Table().with_column('Bootstrap Sample Median', bstrap_medians)
median_bins=np.arange(120000, 160000, 2000)
resampled_medians.hist(bins = median_bins)

# Plotting parameters; you can ignore this code
parameter_green = '#32CD32'
plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2)
plots.title('Bootstrap Medians and the Parameter (Green Dot)');

## Percentile Method: Middle 95% of the Bootstrap Estimates 

In [None]:
# Endpoints of the middle 95% of the bootstrap medians: the 2.5th and 97.5th
# percentiles, leaving roughly 2.5% of the estimates in each tail.
left = percentile(2.5, bstrap_medians)
right = percentile(97.5, bstrap_medians)

# Display the interval as [left, right].
make_array(left, right)

In [None]:
resampled_medians.hist(bins = median_bins)

# Plotting parameters; you can ignore this code
plots.ylim(-0.000005, 0.00014)
plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=3, zorder=1)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2);