# Lecture 15: Sampling

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Probability: Calculation vs Simulation

Roll a six-sided die 20 times. What's P(at least one 6)?

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

### Calculation

In [None]:
# Complement rule: P(at least one 6) = 1 - P(no 6 on any of the 20 rolls),
# where a single roll avoids a 6 with chance 5/6.
1 - (5/6) ** 20

In [None]:
# Tabulate the chance of at least one 6 for every number of rolls from 1 to 50.
rolls = np.arange(1, 51)
at_least_one = Table().with_columns(
    'Rolls', rolls,
    'Chance of at least one 6', 1 - (5/6) ** rolls,
)
# Display the chance column as a percentage (3 decimal places); the column is
# addressed by its label rather than by position.
at_least_one.set_format('Chance of at least one 6', PercentFormatter(3))

In [None]:
# Scatter plot of the chance column against the number of rolls ('Rolls' on the x-axis).
at_least_one.scatter('Rolls')

In [None]:
# Pick out the row for exactly 20 rolls, i.e. the same quantity as 1 - (5/6) ** 20.
at_least_one.where('Rolls', 20)

### Simulation

How many of the following return a random number from 1 to 6 (inclusive), with each number equally likely?

- `np.random.choice(np.arange(6))`  
- `np.random.choice(np.arange(6)+1)`  
- `np.random.choice(np.arange(1,6+1))`  
- `np.random.choice(np.arange(1,3+1))` + `np.random.choice(np.arange(1,3+1))`

A. 0  
B. 1  
C. 2  
D. 3  
E. 4  

<br/><br/><br/><br/><br/><br/>

In [None]:
# One run of the experiment: 20 draws from the faces 1..6.
# np.random.choice samples uniformly and with replacement by default,
# so each draw behaves like an independent roll of a fair die.
rolls = np.random.choice(np.arange(1,6+1), 20)
rolls

In [None]:
# rolls==6 is a boolean array; summing it counts the 6s, so the comparison is
# True exactly when at least one roll was a 6.
# NOTE(review): this cell computes the same expression as the cell that follows it.
sum(rolls==6)>0

In [None]:
# Did this run of 20 rolls contain at least one 6?
# (count of rolls equal to 6, compared against zero)
sum(rolls==6) > 0

In [None]:
# Simulate that experiment many times and estimate P(at least one 6) as the
# fraction of trials in which a 6 showed up.

trials = 10000
successes = 0  # success means at least one 6

# The possible faces never change, so build the array once instead of
# rebuilding it on every one of the `trials` iterations.
faces = np.arange(1, 6+1)

for i in np.arange(trials):
    rolls = np.random.choice(faces, 20)
    # np.any answers "is there at least one 6?" directly in NumPy, instead of
    # counting matches element by element with the built-in sum.
    if np.any(rolls == 6):
        successes = successes + 1

# Empirical proportion of successful trials; should be close to 1 - (5/6) ** 20.
successes/trials

## Sampling

In [None]:
# Load the movies table from top_movies.csv (path relative to the notebook).
top = Table.read_table('top_movies.csv')
# Add a 0-based 'Row Index' column (0 .. num_rows-1) and move it to the front,
# so that rows picked out later can be traced back to their position.
top = top.with_column('Row Index', np.arange(top.num_rows)).move_to_start('Row Index')
top

### Deterministic sampling

In [None]:
# Deterministic: the rows at the fixed positions 3, 5 and 7 — the same rows every time.
top.take([3, 5, 7])

In [None]:
# Deterministic: the rows whose 'Title' contains the text 'and the'; no chance involved.
top.where('Title', are.containing('and the'))

### Probabilistic (random) sampling

In [None]:
# Only the starting position is random (one of 0..9); the sample is then the
# 5 consecutive rows start, start+1, ..., start+4.
# assumes top has at least 14 rows so that start+4 is a valid position — TODO confirm
start = np.random.choice(np.arange(10))
top.take(np.arange(start, start+5))

In [None]:
# All valid row positions of `top`. Positions passed to top.take are 0-based
# (matching the 'Row Index' column), so they run from 0 to num_rows-1.
# Starting at 1 and ending at num_rows would never select the first row and
# would include one position past the end of the table.
indices = np.arange(top.num_rows)
indices

In [None]:
# Draw 5 row positions uniformly at random from `indices`.
# np.random.choice samples with replacement by default, so a position can repeat.
random_indices = np.random.choice(indices, 5)
random_indices

In [None]:
# The rows of `top` at the randomly drawn positions.
top.take(random_indices)

In [None]:
# Table.sample draws random rows directly; here, a single row.
top.sample(1)

In [None]:
# Sample 5 rows from the first 5 rows WITHOUT replacement: no row can be drawn
# twice, so every row appears exactly once (only the order can vary).
top.take(np.arange(5)).sample(5, with_replacement=False)

In [None]:
# Sample 5 rows from the first 5 rows WITH replacement: a row may be drawn
# more than once, and some rows may not be drawn at all.
top.take(np.arange(5)).sample(5, with_replacement=True)

## Sampling from dice

In [None]:
# A six-sided die as a one-column table: one row per face, 1 through 6.
d6 = Table().with_column('face', np.arange(1,6+1))
d6

In [None]:
def face_hist(t):
    """Plot a histogram of the 'face' column of table t.

    The bins are one unit wide with edges at 0.5, 1.5, ..., 6.5, so each
    die face 1-6 sits at the centre of its own bar. The x-axis is
    labelled 'Face'.
    """
    face_bin_edges = np.arange(0.5, 7, 1)
    t.hist('face', bins=face_bin_edges, unit='face')
    plots.xlabel('Face')

In [None]:
# Histogram of 10000 random rows of d6, i.e. 10000 simulated rolls.
# Other sample sizes to try here: 100, 1_000, 10_000, 100_000
face_hist(d6.sample(10000))

**Discussion question:** What do you expect to happen to the histogram as the sample size increases?

In [None]:
# One histogram per sample size 10, 100, 1000, 10000, 100000 (10 ** i for i = 1..5).
for i in np.arange(1,5+1):
    face_hist(d6.sample(10 ** i))

In [None]:
# With larger samples, the histograms get closer and closer to the histogram
# of the die table itself, in which every face appears exactly once:
face_hist(d6)

## Sampling flights

In [None]:
# Load the flights table from united.csv (path relative to the notebook).
# The cells below rely on its 'Delay' column.
united = Table.read_table('united.csv')
united

In [None]:
# The true probability distribution of delays is unknown.
# Instead, treat this table as the whole population and sample from it.
united.hist('Delay')

In [None]:
def delay_hist(t):
    """Plot a histogram of the 'Delay' column of table t.

    Uses one-unit-wide bins with edges at every integer from -20 to 150
    and labels the unit as 'minute'.
    """
    minute_bin_edges = np.arange(-20, 151)
    t.hist('Delay', bins=minute_bin_edges, unit='minute')
    
# Histogram of delays for the full table (the population).
delay_hist(united)

In [None]:
# Among flights with a positive delay, what proportion were delayed between
# 1 and 20 minutes? (Most of them.)
# NOTE(review): per the datascience docs, are.between(1, 20) includes 1 but
# excludes 20; are.between_or_equal_to(1, 20) would count a delay of exactly
# 20 as well — confirm which is intended.
between1and20 = united.where('Delay', are.between(1, 20)).num_rows
num_delays = united.where('Delay', are.above(0)).num_rows
between1and20 / num_delays

In [None]:
# The empirical distribution of a random sample resembles the distribution
# of the population it was drawn from, and more closely so as the sample
# size increases. Here: a random sample of 10000 rows of united.
s = united.sample(10000)
delay_hist(s)

In [None]:
# Same proportion as computed for the full table, now on the random sample s:
# among sampled flights with a positive delay, the share delayed 1 to 20 minutes.
# Note that this reassigns between1and20 and num_delays.
# NOTE(review): per the datascience docs, are.between(1, 20) excludes 20 — confirm intent.
between1and20 = s.where('Delay', are.between(1, 20)).num_rows
num_delays = s.where('Delay', are.above(0)).num_rows
between1and20 / num_delays