In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Pea Flowers

In [None]:
# Population model for the null hypothesis: three 'Purple' entries and one
# 'White', so a uniform draw from this array is 'Purple' with chance 3/4.
null = make_array('Purple', 'Purple', 'Purple', 'White')

def test_stat(sample):
    proportion_purple = np.count_nonzero(sample == 'Purple') / len(sample)
    return abs(proportion_purple - 0.75)

total_plants = 929
observed_purples = 705
# Derive the observed statistic instead of pasting a rounded literal: this is
# the same arithmetic test_stat performs (|proportion purple - 0.75|), so a
# simulated sample with exactly 705 purples compares equal to it.
observed_statistic = abs(observed_purples / total_plants - 0.75)

In [None]:
# Simulating the test statistic under the null hypothesis
repetitions = 5000
# Preallocate the results: growing an array with np.append copies the whole
# array on every iteration, which is quadratic in the number of repetitions.
sampled_stats = np.zeros(repetitions)

for i in np.arange(repetitions):
    # Draw total_plants colors uniformly (with replacement) from the null model
    new_sample = np.random.choice(null, total_plants)
    sampled_stats[i] = test_stat(new_sample)

In [None]:
# Histogram of the simulated statistics, with the observed statistic drawn
# as a vertical line from height 0 to 60.
null_dist = Table().with_column(
    'Distribution of the test statistic under the null hypothesis',
    sampled_stats,
)
null_dist.hist()
_ = plots.plot([observed_statistic] * 2, [0, 60])

## P-value

In [None]:
# The P-value (an approximation based on the simulation): the fraction of
# simulated statistics that are at least as large as the observed one.
# The denominator is taken from sampled_stats itself rather than from the
# global `repetitions`, so the ratio stays a valid proportion even when the
# two names are reassigned independently by other cells.
np.count_nonzero(sampled_stats >= observed_statistic) / len(sampled_stats)

## Test Statistics

In [None]:
def number_of_different_colors(sample):
    """Return how many distinct values appear in ``sample``.

    Parameters
    ----------
    sample : array-like
        Flower colors, one entry per plant.

    Returns
    -------
    int
        The number of unique values in ``sample``.
    """
    # np.unique counts distinct values directly; there is no need to build
    # and group a Table for every simulated sample.
    return len(np.unique(sample))

# Observed value of the statistic number_of_different_colors.
# NOTE(review): presumably 2 because the real sample contains both colors — confirm.
observed = 2

In [None]:
# Simulating the test statistic under the null hypothesis
repetitions = 500
# Preallocate the results: growing an array with np.append copies the whole
# array on every iteration, which is quadratic in the number of repetitions.
sampled_stats = np.zeros(repetitions)

for i in np.arange(repetitions):
    # Draw total_plants colors uniformly (with replacement) from the null model
    new_sample = np.random.choice(null, total_plants)
    sampled_stats[i] = number_of_different_colors(new_sample)

In [None]:
# Histogram of the simulated statistics, with the observed value drawn as a
# vertical line from height 0 to 10.
Table().with_column(
    'Null distribution', sampled_stats
).hist()
_ = plots.plot([observed] * 2, [0, 10])

In [None]:
def number_of_purples(sample):
    """Return the number of entries in ``sample`` equal to 'Purple'.

    Parameters
    ----------
    sample : array-like of str
        Flower colors, one entry per plant. Lists and tuples are accepted as
        well as NumPy arrays.

    Returns
    -------
    int
        Count of 'Purple' entries.
    """
    # Coerce to an array so that `== 'Purple'` compares element-wise; on a
    # plain list the comparison collapses to a single False and counts nothing.
    return np.count_nonzero(np.asarray(sample) == 'Purple')

# Observed value of the statistic number_of_purples: 705 purple plants.
observed = 705

In [None]:
# Simulating the test statistic under the null hypothesis
repetitions = 5000
# Preallocate the results: growing an array with np.append copies the whole
# array on every iteration, which is quadratic in the number of repetitions.
sampled_stats = np.zeros(repetitions)

for i in np.arange(repetitions):
    # Draw total_plants colors uniformly (with replacement) from the null model
    new_sample = np.random.choice(null, total_plants)
    sampled_stats[i] = number_of_purples(new_sample)

In [None]:
# Histogram of the simulated purple counts, with the observed count drawn as
# a vertical line from height 0 to .03.
Table().with_column(
    'Null distribution', sampled_stats
).hist()
_ = plots.plot([observed] * 2, [0, .03])

## Deflategate

In [None]:
# Read the football measurements and discard the 'Team' column in one chain,
# then display every row.
football = Table.read_table(
    'http://inferentialthinking.com/notebooks/football.csv'
).drop('Team')
football.show()

In [None]:
# Estimated starting value per ball: 12.5 for the first 11 rows, 13 for the
# last 4.
# NOTE(review): presumably the row order groups the two teams as 11 + 4 — confirm against the CSV.
initials = np.append(np.ones(11) * 12.5, np.ones(4) * 13)
# Per-ball mean of the two measurement columns at positions 1 and 2.
averages = (football.column(1) + football.column(2)) / 2
# with_columns (plural) is the Table method that takes several label/value
# pairs; the singular with_column is documented for a single column.
football = football.with_columns(
    # Strip spaces and digits from both ends of each 'Ball' label
    'Team', np.char.strip(football.column('Ball'), ' 1234567890'),
    'Average at Half', averages,
    'Estimate at Start', initials,
    'Drop', initials - averages
)
football.show()

In [None]:
def difference_in_average_drop(t):
    """Return the second group's average drop minus the first group's.

    ``t`` must provide 'Team' and 'Drop' columns. Rows are grouped by 'Team',
    the 'Drop' values in each group are averaged with ``np.average``, and the
    average in the first grouped row is subtracted from the average in the
    second grouped row.
    """
    drop_by_team = t.select('Team', 'Drop').group('Team', np.average)
    mean_drops = drop_by_team.column(1)
    first_group, second_group = mean_drops.item(0), mean_drops.item(1)
    return second_group - first_group

# Observed test statistic: difference in average drop computed on the real
# (unshuffled) table. The bare name on the last line displays the value.
observed = difference_in_average_drop(football)
observed

In [None]:
# One example shuffle: sampling every row without replacement permutes the
# 'Drop' values, which are then paired with the original 'Team' labels.
# The sample size comes from the table rather than a hard-coded row count.
shuffled = football.select('Drop').sample(football.num_rows, with_replacement=False)
football.select('Team').with_column('Drop', shuffled.column(0))

In [None]:
num_shuffles = 10000
# Preallocate the results: growing an array with np.append copies the whole
# array on every iteration, which is quadratic in the number of shuffles.
sampled_stats = np.zeros(num_shuffles)

# These two single-column tables do not change between iterations.
team_labels = football.select('Team')
drops = football.select('Drop')

for i in np.arange(num_shuffles):
    # Permute the drops by sampling every row without replacement
    shuffled = drops.sample(drops.num_rows, with_replacement=False)
    sample = team_labels.with_column('Drop', shuffled.column(0))
    sampled_stats[i] = difference_in_average_drop(sample)

In [None]:
# Histogram of the shuffled statistics, with the observed difference drawn as
# a vertical line from height 0 to 1.4.
Table().with_column(
    'Null distribution', sampled_stats
).hist()
_ = plots.plot([observed] * 2, [0, 1.4])

In [None]:
# Empirical P-value: the fraction of simulated statistics that are at least
# as large as the observed one.
np.count_nonzero(sampled_stats >= observed)/len(sampled_stats)