In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

### Probability & Empirical Distributions of a Statistic

In [None]:
# Load the data set from united.csv (path is relative to the working directory).
# The 'Delay' column of this table is what the sampling code below reads.
united = Table.read_table('united.csv')

In [None]:
def sample_median(size):
    """Return the median 'Delay' of one random sample of rows from `united`.

    size: number of rows to draw for the sample.

    NOTE(review): Table.sample is called with its defaults here; per the
    datascience documentation that means sampling with replacement — confirm
    that is the intended sampling scheme.
    """
    sampled_rows = united.sample(size)
    delays = sampled_rows.column('Delay')
    return np.median(delays)

In [None]:
# One draw of the statistic: the median delay in a random sample of 10 rows.
# Re-running this cell gives a different value each time.
sample_median(10)

In [None]:
# Number of repetitions used by the median-simulation loops that follow.
num_simulations = 2000

In [None]:
# Simulate the statistic num_simulations times for samples of size 10.
# The draws are gathered in a list and converted to an array once at the end:
# np.append returns a fresh copy of the whole array on every call, so growing
# the array inside the loop does quadratic work for no benefit.
sample_medians = np.array(
    [sample_median(10) for _ in range(num_simulations)]
)

In [None]:
# Empirical distribution of the simulated medians (samples of size 10), 20 bins.
Table().with_column('Sample medians (size=10)', sample_medians).hist(bins=20)

In [None]:
# Repeat the simulation with samples of size 1000. This deliberately rebinds
# `sample_medians`, replacing the size-10 results from the earlier cell.
# Draws are collected in a list and converted once, because np.append copies
# the entire array on each call and makes the loop quadratic.
sample_medians = np.array(
    [sample_median(1000) for _ in range(num_simulations)]
)

In [None]:
# Empirical distribution of the simulated medians (samples of size 1000),
# using unit-width bins whose edges run from -2 to 29.
Table().with_column('Sample medians (size=1000)', sample_medians).hist(bins=np.arange(-2, 30))

#### Empirical Distributions of a Statistic (Overlaid)

In [None]:
# Simulate the sample median for three sample sizes side by side.
num_simulations = 2000

# Plain lists are cheap to grow; np.append would copy each array in full on
# every iteration. The three draws stay interleaved (10, then 100, then 1000)
# within each repetition so the order of random draws is unchanged.
medians_10, medians_100, medians_1000 = [], [], []

for _ in range(num_simulations):
    medians_10.append(sample_median(10))
    medians_100.append(sample_median(100))
    medians_1000.append(sample_median(1000))

# Convert once, keeping the names that later cells read.
sample_medians_10 = np.array(medians_10)
sample_medians_100 = np.array(medians_100)
sample_medians_1000 = np.array(medians_1000)

In [None]:
# Gather the three simulated distributions into one table, one column per
# sample size. Note that `sample_medians` is rebound to a Table here.
sample_medians = Table().with_columns(
    'Size 10', sample_medians_10,
    'Size 100', sample_medians_100,
    'Size 1000', sample_medians_1000,
)

In [None]:
# Histogram all three columns on the same unit-width bins (edges -5 to 29)
# so the distributions for the different sample sizes can be compared.
sample_medians.hist(bins = np.arange(-5, 30))

## The Case of the Fake Data ##

In [None]:
# Model for a single roll of two dice, as two categories:
# index 0 = snake eyes (chance 1/36), index 1 = any other outcome (35/36).
theoretical_proportions = make_array(1/36, 35/36)
theoretical_proportions

In [None]:
# One simulated batch of 100 rolls under the model: the result holds the
# proportion of rolls that fell in each of the two categories.
sample_proportions(100, theoretical_proportions)

In [None]:
def simulated_snake_eyes(num_rolls=100):
    """Simulate rolling two dice `num_rolls` times; return the snake-eyes count.

    num_rolls: how many rolls to simulate (default 100, the value this
        notebook uses throughout).

    Returns the number of simulated rolls that landed in the first category
    of `theoretical_proportions`, as a whole-valued float.
    """
    proportion = sample_proportions(num_rolls, theoretical_proportions).item(0)
    # Scaling a proportion back to a count can miss the whole number by a
    # rounding error (0.07 * 100 == 7.000000000000001), so snap the product
    # to the nearest integer. float() keeps the return type a plain float.
    return float(round(proportion * num_rolls))

In [None]:
# One simulated value of the statistic; it changes on every run.
simulated_snake_eyes()

In [None]:
# Simulate the snake-eyes statistic 10000 times under the model.
# Values are gathered in a list and converted to an array once; appending
# with np.append inside the loop would copy the whole array on every pass.
num_snake_eyes = np.array(
    [simulated_snake_eyes() for _ in range(10000)]
)

In [None]:
# Empirical distribution of the simulated snake-eyes counts.
Table().with_column(
    'Number of Snake Eyes in 100 Rolls of Two Dice', num_snake_eyes
).hist()

# Plotting details; ignore this code
# Draws a red dot at x = 8, slightly above the horizontal axis.
# NOTE(review): 8 is presumably the observed count being compared with the
# simulated distribution — it is hard-coded here; confirm its source.
plots.scatter(8, 0.002, color='red', s=30, zorder=10);

## Mendel and Pea Flowers ##

In [None]:
## Mendel had 929 plants, of which 709 had purple flowers
# observed_purples is the observed proportion (between 0 and 1), not a percent.
# NOTE(review): Mendel's published count is usually cited as 705 purple out of
# 929 plants — confirm whether 709 is intended before relying on this value.
observed_purples = 709 / 929
observed_purples

In [None]:
# Model under test, as two categories: index 0 = purple (chance .75),
# index 1 = not purple (chance .25).
predicted_proportions = make_array(.75, .25)
# One simulated sample of 929 plants: the proportion falling in each category.
sample_proportions(929, predicted_proportions)

In [None]:
def purple_flowers():
    """Simulate one sample of 929 plants and return the percent that are purple.

    Draws the sample from `predicted_proportions` and reports the share in its
    first category, scaled from a proportion to a percent (0 to 100).
    """
    simulated = sample_proportions(929, predicted_proportions)
    purple_share = simulated.item(0)
    return purple_share * 100

In [None]:
# One simulated percent of purple-flowering plants; it changes on every run.
purple_flowers()

In [None]:
# Simulate the percent of purple flowers 10000 times under the model.
# Values are gathered in a list and converted to an array once; appending
# with np.append inside the loop would copy the whole array on every pass.
purples = np.array(
    [purple_flowers() for _ in range(10000)]
)

In [None]:
# Empirical distribution of the simulated purple-flower percents.
Table().with_column('Percent of purple flowers in sample of 929', purples).hist()

In [None]:
# A better choice of statistic
def purple_discrepancy(num_purps):
    """Return how far a purple-flower percent is from 75.

    num_purps: despite the name, a percent on the 0-100 scale (a single
        number or an array of them), not a count of plants.

    The result is the absolute difference, so values above and below 75 by
    the same amount give the same discrepancy. Arrays are handled
    element-wise.
    """
    gap = num_purps - 75
    return abs(gap)

In [None]:
# Empirical distribution of the discrepancy statistic, computed element-wise
# from the simulated percents in `purples`.
Table().with_column('Discrepancy in sample of 929 if the model is true', purple_discrepancy(purples)).hist()

# Plotting details; ignore this code
# Red dot at the observed discrepancy: observed_purples is a proportion, so it
# is scaled by 100 to a percent before being compared with 75.
plots.scatter(purple_discrepancy(observed_purples * 100), 0.007, color='red', s=30, zorder=10);