In [None]:
from datascience import *
import numpy as np

# Render figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plots
# Use matplotlib's 'fivethirtyeight' style sheet for the figures in this notebook.
plots.style.use('fivethirtyeight')

## Random Sampling ##

In [None]:
# Read the flight records from united.csv into a Table and display it.
united = Table.read_table('united.csv')
united 

Some deterministic samples:

In [None]:
# Deterministic sample: keep only the flights whose destination is JFK.
united.where('Destination', are.equal_to('JFK'))

In [None]:
# Deterministic sample: every 1000th row of the table, starting at row 0.
united.take(np.arange(0, united.num_rows, 1000))

In [None]:
# Deterministic sample: the rows at three hand-picked indices.
united.take(make_array(77, 103, 2405))

A systematic random sample (a random starting row, then every 1000th row after it):

In [None]:
# Systematic random sample: choose a random starting row within the first
# interval, then take every `interval`-th row after it.
#
# The start is drawn from the first `interval` rows only (capped at the table
# length). Drawing it from all rows would make later rows more likely to be
# selected and let the sample shrink to a single row when the start lands
# near the end of the table.
interval = 1000
start = np.random.choice(np.arange(min(interval, united.num_rows)))
systematic_sample = united.take(np.arange(start, united.num_rows, interval))
systematic_sample

## Distributions

In [None]:
die = Table().with_column('Face', np.arange(1, 7))
die

In [None]:
die.sample(10)

In [None]:
die.hist(bins=np.arange(0.5, 6.6, 1))

In [None]:
die.sample(10).hist(bins=np.arange(0.5, 6.6, 1))

In [None]:
die.sample(1000).hist(bins=np.arange(0.5, 6.6, 1))

In [None]:
die.sample(10000).hist(bins=np.arange(0.5, 6.6, 1))

## `sample_proportions`

About 10% of Americans 18 and over suffer from a depressive illness in any given year. [[source](https://www.cdc.gov/workplacehealthpromotion/health-strategies/depression/index.html)]

In [None]:
# Population proportions, in the order (no depressive illness, depressive illness):
# 90% and 10%, matching the "about 10%" figure quoted above.
distribution_in_pop = make_array(0.9, 0.1)
distribution_in_pop

Suppose you have 100 friends.  Let's simulate how many might suffer from a depressive illness, assuming we can treat each friend as drawn randomly from the population of US adults (a dubious assumption).

In [None]:
# Simulate 100 friends drawn at random according to distribution_in_pop.
# NOTE(review): per the datascience docs, sample_proportions returns the
# proportion of draws that fell in each category (not counts), in the same
# order as the input distribution -- confirm against the installed version.
distribution_in_sample = sample_proportions(100, distribution_in_pop)

In [None]:
# Show the simulated proportions. Assuming the output is proportions as noted
# above, multiplying the second entry by 100 (the sample size) gives the
# number of simulated friends with a depressive illness.
distribution_in_sample

## Large Random Samples

We load in a dataset of all United domestic flights from 6/1/15 to 8/9/15, with each flight's destination and how long it was delayed, in minutes.

In [None]:
# Bin edges for the delay histograms: 5-minute-wide bins from -20 to 200 minutes.
united_bins = np.arange(-20, 201, 5)
# Read the flight records from united.csv and display the table.
united = Table.read_table('united.csv')
united

In [None]:
# (Population) Probability Distribution: delays of every flight in the table
united.hist('Delay', bins = united_bins)

In [None]:
# (Sample) Empirical Distribution: delays in one random sample of 10 flights
united.sample(10).hist('Delay', bins = united_bins)

In [None]:
# (Sample) Empirical Distribution: delays in one random sample of 1,000 flights
united.sample(1000).hist('Delay', bins = united_bins)

## Simulating Statistics ##

In [None]:
# (Population) Parameter: median delay over every flight in the table
np.median(united.column('Delay'))

In [None]:
# (Sample) Statistic: median delay in one random sample of 10 flights
np.median(united.sample(10).column('Delay'))

## Probability and Empirical Distributions of a Statistic

In [None]:
def sample_median(size):
    """Return the median delay of one random sample of flights.

    Draws `size` rows at random from the `united` table and computes the
    median of that sample's 'Delay' column.
    """
    sampled_delays = united.sample(size).column('Delay')
    return np.median(sampled_delays)

In [None]:
# One simulated value of the statistic, from a random sample of 10 flights.
sample_median(10)

In [None]:
# Simulate the sample median 1,000 times for samples of 10 flights and
# collect the results in one array.
# The array is built in a single step: growing it with np.append inside a
# loop allocates and copies a new array on every iteration.
sample_medians = np.array([sample_median(10) for _ in np.arange(1000)])

In [None]:
# Empirical distribution of the simulated sample medians, using unit-width
# bins with edges at the integers from -10 to 30.
Table().with_column('Sample medians', sample_medians).hist(bins = np.arange(-10,31))

In [None]:
# Simulate the sample median 1,000 times for samples of 1,000 flights and
# collect the results in one array (this replaces any earlier sample_medians).
# The array is built in a single step: growing it with np.append inside a
# loop allocates and copies a new array on every iteration.
sample_medians = np.array([sample_median(1000) for _ in np.arange(1000)])

In [None]:
# Empirical distribution of the simulated sample medians currently stored in
# sample_medians, using unit-width bins with edges at the integers -10 to 30.
Table().with_column(
    'Sample medians', sample_medians).hist(bins = np.arange(-10,31))