In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
# Use matplotlib's 'fivethirtyeight' style sheet for the plots drawn in this notebook.
plots.style.use('fivethirtyeight')

## New material

In [None]:
# Read united.csv (path is relative to the notebook's working directory) into a Table.
united = Table.read_table('united.csv')
# A bare name as the last line makes the notebook display the table.
united

### Make sure you remember these terms!

In [None]:
# Median of the 'Delay' column computed over every row of `united`.
np.median(united.column('Delay'))

In [None]:
# Draw 10 rows at random from `united`; no `with_replacement` argument is given,
# so Table.sample's default applies (with replacement, per the datascience docs).
ten_flights = united.sample(k=10)
ten_flights

In [None]:
# Median delay of this one sample; it changes whenever `ten_flights` is re-drawn.
np.median(ten_flights.column('Delay'))

**STOP**

### Types of samples

I'm going to add a column called `Row` that I think will help keep track of what is going on in these next few code cells.

In [None]:
# Number the rows 0, 1, 2, ... in a 'Row' column and make it the first column,
# so each sampled row can be traced back to its position in the table.
united = (
    united
    .with_column('Row', np.arange(united.num_rows))
    .move_to_start('Row')
)
united

#### **Task**: Take some *deterministic* samples of 5 rows from the `united` table.

In [None]:
# Deterministic sample: keep only flights whose Destination equals 'JFK',
# then take the first five of those rows (indices 0-4 of the filtered table).
united.where('Destination', are.equal_to('JFK')).take(np.arange(5))

In [None]:
# Deterministic sample: row indices 75, 80, 85, 90, 95 (the stop value 100 is excluded).
united.take(np.arange(75, 100, 5))

In [None]:
# Deterministic sample: five hand-picked row indices, the same on every run.
united.take(make_array(100,217,324,432,510))

#### **Task**: Take a convenience sample of size 5 from the `united` table. 

In [None]:
# Convenience sample: simply the first five rows (indices 0-4) of the table.
united.take(np.arange(5))

#### **Task**: Take some *random* samples of 5 rows from the `united` table.

##### Way 1: randomly pick a starting point and pick every 5th row from that point

In [None]:
# Pick one starting row index at random from 0..999 (np.arange(1000) excludes 1000).
# NOTE(review): the bound 1000 is hard-coded rather than derived from united.num_rows,
# so later rows can never be chosen as a starting point — confirm this is intentional.
start = np.random.choice(np.arange(1000))

In [None]:
# Display the starting index that was drawn.
start

In [None]:
# Beginning at `start`, take every 5th row: start, start+5, ..., start+20 (five rows).
random_sample_1 = united.take(np.arange(start, start + 25, 5))
random_sample_1

##### Way 2: pick five rows at random

In [None]:
# Five rows drawn at random without replacement, so no row can appear twice.
united.sample(k=5, with_replacement=False)

**STOP**

### Distributions also apply to random quantities

In [None]:
# One-column table holding the six faces of a die: 1, 2, 3, 4, 5, 6.
die = Table().with_columns('Face', np.arange(1, 7))
die

In [None]:
# Bin edges 1 through 7 give one unit-wide bin per face of the die.
die.hist(bins=np.arange(1, 8), unit='spot')
print('Probability (theoretical) distribution: Each spot has an equal chance of being rolled')

In [None]:
# Simulate 10 rolls by sampling 10 rows of `die`, then plot them
# with one unit-wide bin per face (edges 1 through 7).
die.sample(10).hist(bins=np.arange(1, 8), unit='spot')
print('Empirical Distribution: this histogram will vary each time you run this code!')

**STOP**

### The LLN applies to empirical distributions

#### **Task (distributions)**: Create:
- The theoretical probability distribution of a six-sided die
- The observed (empirical) distribution of a six-sided die after 10, 100, 1,000, and 10,000 rolls.

**Theoretical distribution (reminder from earlier)**

In [None]:
# Same plot as earlier: bin edges 1 through 7 give one unit-wide bin per face.
die.hist(bins=np.arange(1, 8), unit='spot')
print('Probability (theoretical) distribution: Each spot has an equal chance of being rolled')

**Observed (empirical) distributions**

In [None]:
# Empirical distribution of just 5 simulated rolls (one unit-wide bin per face).
die.sample(5).hist(bins=np.arange(1, 8), unit='spot')

Let's make this easier using a function!

In [None]:
def observed_die_distribution(k):
    """Plot the empirical distribution of `k` simulated die rolls.

    Samples `k` rows from the notebook-level `die` table and draws a
    histogram of the sampled faces, using bin edges 1 through 7 so that
    each face gets its own unit-wide bin. Nothing is returned; the plot
    is the only output.
    """
    face_bin_edges = np.arange(1, 8)
    die.sample(k).hist(bins=face_bin_edges, unit='spot')

In [None]:
# Plot the empirical distribution of 10 simulated rolls.
observed_die_distribution(10)
print("This doesn't look much like the theoretical distribution!")

In [None]:
# Plot the empirical distribution of 100 simulated rolls.
observed_die_distribution(100)
print('This looks closer to the theoretical distribution!')

In [None]:
# Plot the empirical distribution of 1,000 simulated rolls.
observed_die_distribution(1000)
print('This looks much more like the theoretical distribution!')

In [None]:
# Plot the empirical distribution of 10,000 simulated rolls.
observed_die_distribution(10000)
print('This looks a lot like the theoretical distribution!')

**STOP**

### Distributions also apply to random quantities

#### **Task**: Find or create
- The median delay time (in minutes) across all United flights from 6/1/15 to 8/9/15.
- The observed distribution of median delay times after taking samples (*with replacement*) of: 10, 100, 1,000 and 10,000 randomly sampled flights.

In [None]:
# Median of the 'Delay' column over all rows of `united` (the value the samples below estimate).
np.median(united.column('Delay'))

In [None]:
# Draw a fresh random sample of 10 rows; this overwrites the earlier `ten_flights`.
ten_flights = united.sample(k=10)

In [None]:
# Median delay within the 10 sampled flights.
np.median(ten_flights.column('Delay'))

In [None]:
def observed_sample_median(sample_size):
    """Return the median delay of one random sample of flights.

    Draws `sample_size` rows from the notebook-level `united` table, with
    replacement (spelled out here; it is also `Table.sample`'s default), and
    returns the median of the sampled rows' 'Delay' column.
    """
    sampled_delays = united.sample(k=sample_size, with_replacement=True).column('Delay')
    return np.median(sampled_delays)

In [None]:
# One simulated value of the statistic: the median delay in a random sample of 100 flights.
observed_sample_median(100)

##### **Discussion** [1 min]: 
- What are the data type(s) of the input?
- What is the data type of the output?

In [None]:
def observed_sample_median_distribution(repetition_size, sample_size):
    """Simulate the sample median many times and return all simulated values.

    Parameters
    ----------
    repetition_size : number of independent samples to draw (one median each).
    sample_size : number of flights in each sample; passed straight through
        to `observed_sample_median`.

    Returns
    -------
    A float array of length `repetition_size` holding one sample median per
    repetition, in the order they were simulated. The array is empty when
    `repetition_size` is 0.
    """
    # Gather the medians in a list and convert once at the end. Growing an
    # array with np.append inside the loop copies the whole array on every
    # iteration, which is quadratic in repetition_size.
    sample_medians = [
        observed_sample_median(sample_size)
        for _ in np.arange(repetition_size)
    ]

    # dtype=float keeps the result a float array even when it is empty.
    return np.array(sample_medians, dtype=float)

Discussion [1 min]: For the LLN to be invoked, what kinds of numbers should you put in the `sample_size` argument?

- 10?
- 1000?

In [None]:
# `...` is a fill-in placeholder, not a real sample size: replace it with the value
# chosen in the discussion above before running this cell.
results = observed_sample_median_distribution(repetition_size=1000,sample_size=...)

In [None]:
# Put the simulated medians in a one-column table and plot their histogram
# (the empirical distribution of the sample median).
Table().with_column('Sample medians', results).hist()

## Review
- conducted at https://pollev.com/jeremysanchez.