In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## `sample_proportions`

About 10% of Americans 18 and over suffer from a depressive illness in any given year. [[source](https://www.cdc.gov/workplacehealthpromotion/health-strategies/depression/index.html)]

In [None]:
# Population proportions in the order [no depressive illness, depressive illness];
# the 0.1 entry is the "about 10%" figure quoted in the text above.
distribution_in_pop = make_array(0.9, 0.1)

In [None]:
distribution_in_pop

Suppose you have 100 friends.  Let's simulate how many might suffer from a depressive illness, assuming we can treat each friend as drawn randomly from the population of US adults (a dubious assumption).

In [None]:
distribution_in_sample = sample_proportions(100, distribution_in_pop)

In [None]:
distribution_in_sample

## Large Random Samples

We load in a dataset of all United flights national flights from 6/1/15 to 8/9/15, their destination and how long they were delayed, in minutes.

In [None]:
# Read the flight table from 'united.csv' (relative path).
united = Table.read_table('united.csv')
# Bin edges shared by the delay histograms below: -20, -15, ..., 200
# (step 5; the stop value 201 is exclusive, so 200 is the last edge).
united_bins = np.arange(-20, 201, 5)

In [None]:
# (Population) Probability Distribution
united.hist('Delay', bins = united_bins)

In [None]:
# (Sample) Empirical Distribution
united.sample(10).hist('Delay', bins = united_bins)

In [None]:
# (Sample) Empirical Distribution
united.sample(1000).hist('Delay', bins = united_bins)

## Statistics

In [None]:
# (Population) Parameter
np.median(united.column('Delay'))

In [None]:
# (Sample) Statistic
np.median(united.sample(10).column('Delay'))

In [None]:
# (Sample) Statistic
np.median(united.sample(100).column('Delay'))

### Probability & Empirical Distributions of a Statistic

In [None]:
def sample_median(size):
    """Return the median 'Delay' value in a random sample of `size` rows of `united`.

    Each call draws a fresh sample, so repeated calls generally give
    different results.
    """
    sampled_delays = united.sample(size).column('Delay')
    return np.median(sampled_delays)

In [None]:
sample_median(10)

In [None]:
num_simulations = 2000

In [None]:
# Empirical distribution of the sample median for samples of size 10:
# draw `num_simulations` independent samples and record each one's median.
# The medians are gathered in a list and converted to an array once, because
# np.append copies the entire array on every call (quadratic work in a loop).
sample_medians = np.array(
    [sample_median(10) for _ in np.arange(num_simulations)]
)

In [None]:
Table().with_column('Sample medians (size=10)', sample_medians).hist(bins=20)

In [None]:
# Same simulation as before, but with samples of size 1000. This rebinds
# `sample_medians`, replacing the size-10 results.
# The medians are gathered in a list and converted to an array once, because
# np.append copies the entire array on every call (quadratic work in a loop).
sample_medians = np.array(
    [sample_median(1000) for _ in np.arange(num_simulations)]
)

In [None]:
Table().with_column('Sample medians (size=1K)', sample_medians).hist()

#### Empirical Distributions Overlayed

In [None]:
num_simulations = 2000

# One list of simulated medians per sample size. Plain lists are used while
# collecting because np.append copies the whole array on every call.
medians_by_size = {10: [], 100: [], 1000: []}

# Within each simulation round, draw one sample of every size, in the order
# 10, 100, 1000 (dicts preserve insertion order), and record its median.
for _ in np.arange(num_simulations):
    for size, collected in medians_by_size.items():
        collected.append(sample_median(size))

# Convert once at the end; these are the arrays the later cells consume.
sample_medians_10 = np.array(medians_by_size[10])
sample_medians_100 = np.array(medians_by_size[100])
sample_medians_1000 = np.array(medians_by_size[1000])

In [None]:
# Gather the three simulated distributions into one table (one column per
# sample size) so they can be drawn on a single set of axes.
sample_medians = Table().with_columns(
    'Size 10', sample_medians_10,
    'Size 100', sample_medians_100,
    'Size 1000', sample_medians_1000,
)

In [None]:
sample_medians.hist(bins = np.arange(-5, 30))

## Swain vs. Alabama ##

In [None]:
# Proportions of the two categories in the eligible population.
# NOTE(review): the first entry (.26) is presumably the proportion of Black men,
# since later cells take item(0) and plot it as "Number of Black Men" — confirm.
population_proportions = make_array(.26, .74)
population_proportions

In [None]:
sample_proportions(100, population_proportions)

In [None]:
def panel_proportion():
    """Simulate one panel of 100 drawn according to `population_proportions`.

    Returns the sample proportion that falls in the first category.
    """
    simulated_panel = sample_proportions(100, population_proportions)
    return simulated_panel.item(0)

In [None]:
panel_proportion()

In [None]:
# Simulate 10,000 panels. Multiplying the first-category proportion by 100
# (the panel size) turns it into a count per panel of 100.
# The counts are gathered in a list and converted to an array once, because
# np.append copies the entire array on every call (quadratic work in a loop).
panels = np.array(
    [panel_proportion() * 100 for _ in np.arange(10000)]
)

In [None]:
Table().with_column('Number of Black Men on Panel of 100', panels).hist(bins=np.arange(5.5,40.))

## Mendel and Pea Flowers ##

In [None]:
## Mendel had 929 plants, of which 709 had purple flowers
observed_purples = 709 / 929
observed_purples

In [None]:
predicted_proportions = make_array(.75, .25)
sample_proportions(929, predicted_proportions)

In [None]:
def purple_flowers():
    """Simulate 929 plants under `predicted_proportions`.

    Returns the first-category share of the simulated sample as a percent
    (proportion times 100).
    """
    simulated_sample = sample_proportions(929, predicted_proportions)
    return simulated_sample.item(0) * 100

In [None]:
purple_flowers()

In [None]:
# Simulate 10,000 samples of 929 plants under the model and record the
# percent of purple-flowering plants in each.
# The percents are gathered in a list and converted to an array once, because
# np.append copies the entire array on every call (quadratic work in a loop).
purples = np.array(
    [purple_flowers() for _ in np.arange(10000)]
)

In [None]:
Table().with_column('Percent of purple flowers in sample of 929', purples).hist()

In [None]:
# Absolute distance of each simulated percent from the model's 75%.
discrepancies = abs(purples - 75)
Table().with_column('Discrepancy in sample of 929 if the model is true', discrepancies).hist()

In [None]:
abs(observed_purples * 100 - 75)