In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Pea Flowers

In [None]:
# Number of plants in the study; simulate_once also uses it as the size of each
# simulated sample.
total_plants = 929
# Observed value of the test statistic, |705/929 - 0.75|.
observed_statistic = 0.0088805166846070982 # 705 were Purple

def test_stat(sample):
    proportion_purple = np.count_nonzero(sample == 'Purple') / len(sample)
    return abs(proportion_purple - 0.75)

def simulate_once():
    """Simulate one sample of total_plants flowers and return its test statistic.

    Each flower's color is drawn at random (np.random.choice, default settings)
    from a pool holding three 'Purple' entries and one 'White' entry.
    """
    color_pool = make_array('Purple', 'Purple', 'Purple', 'White')
    return test_stat(np.random.choice(color_pool, total_plants))

# Rejection threshold: simulate_and_conclude rejects the null when the simulated
# test statistic is at least this large.
cutoff = 0.028 # Don't worry how I chose this

def simulate_and_conclude():
    """Simulate one sample and print the decision of the test.

    Prints a rejection message when the simulated statistic is at least
    `cutoff`, and an inconclusive message otherwise. Returns nothing.
    """
    stat = simulate_once()
    verdict = (
        'Reject the null; test statistic was'
        if stat >= cutoff
        else 'Inconclusive - cannot reject the null; test statistic was'
    )
    print(verdict, stat)

In [None]:
# Run one simulated experiment and print whether the test rejects the null.
simulate_and_conclude()

## Error probabilities

In [None]:
# Simulating the test statistic under the null hypothesis
repetitions = 40000

# Collect the statistics in a list and convert to an array once at the end.
# np.append returns a fresh copy of the whole array on every call, so appending
# inside the loop made the simulation quadratic in `repetitions`.
sampled_stats = np.array([simulate_once() for _ in np.arange(repetitions)])

In [None]:
# One-column table of the simulated statistics, drawn as a histogram.
null_dist = Table().with_column(
    'Distribution of the test statistic under the null hypothesis',
    sampled_stats,
)
null_dist.hist()
# Vertical line marking the cutoff; assigning the result keeps the returned
# line objects out of the cell output.
_ = plots.plot([cutoff] * 2, [0, 60])

In [None]:
# Proportion of the simulated statistics that are at least `cutoff`.
np.count_nonzero(null_dist.column(0) >= cutoff) / repetitions

## A different cutoff

In [None]:
# A larger threshold than `cutoff`, marked on the same null histogram.
other_cutoff = 0.037
null_dist.hist()
_ = plots.plot([other_cutoff] * 2, [0, 60])

In [None]:
# Proportion of the simulated statistics that are at least `other_cutoff`.
np.count_nonzero(null_dist.column(0) >= other_cutoff) / repetitions

## Yet another cutoff

In [None]:
# This threshold is the same number as `observed_statistic`.
third_cutoff = 0.0088805166846070982
null_dist.hist()
_ = plots.plot([third_cutoff] * 2, [0, 60])

In [None]:
# Proportion of the simulated statistics that are at least `third_cutoff`.
np.count_nonzero(null_dist.column(0) >= third_cutoff) / repetitions

## P-value

In [None]:
# The P-value (an approximation based on the simulation): the proportion of
# simulated statistics that are at least as large as the observed one.
np.count_nonzero(null_dist.column(0) >= observed_statistic) / repetitions

## Deflategate

In [None]:
# Read the football table from the web, discard its 'Team' column, and display
# every row.
football = Table.read_table(
    'http://inferentialthinking.com/notebooks/football.csv'
).drop('Team')
football.show()

In [None]:
# Estimated starting value for each ball, assigned by row position: 12.5 for
# the first 11 rows and 13 for the remaining 4.
initials = np.append(np.ones(11) * 12.5, np.ones(4) * 13)
# Row-wise mean of the 'Blakeman' and 'Prioleau' columns.
averages = (football.column('Blakeman') + football.column('Prioleau')) / 2
# with_columns is the API for adding several label/value pairs at once.
# with_column is the single-column form, and passing extra pairs to it is not
# accepted by current datascience releases.
football = football.with_columns(
    # Team label = the 'Ball' value with spaces and digits stripped from both ends.
    'Team', np.char.strip(football.column('Ball'), ' 1234567890'),
    'Average at Half', averages,
    'Estimate at Start', initials,
    'Drop', initials - averages
)
football.show()

In [None]:
def difference_in_average_drop(t):
    """Return the difference between two teams' average 'Drop'.

    t: table with 'Team' and 'Drop' columns.
    Groups the rows by 'Team', averages 'Drop' within each group, and returns
    the second group's average minus the first group's, in the order that
    `group` lists them.
    """
    team_means = t.select('Team', 'Drop').group('Team', np.average)
    mean_drops = team_means.column(1)
    first_mean = mean_drops.item(0)
    second_mean = mean_drops.item(1)
    return second_mean - first_mean

# Test statistic computed on the actual (unshuffled) football table.
observed = difference_in_average_drop(football)
observed

In [None]:
# One random permutation of the 'Drop' values: sampling every row without
# replacement shuffles the column. Using num_rows instead of a literal 15
# keeps the shuffled column the same length as the table.
shuffled = football.select('Drop').sample(football.num_rows, with_replacement=False)
# Pair the original 'Team' labels with the shuffled drops.
football.select('Team').with_column('Drop', shuffled.column(0))

In [None]:
# Permutation test: repeatedly shuffle the 'Drop' values against the fixed
# 'Team' labels and record the test statistic of each shuffled table.

# These single-column tables do not change between iterations, so build them once.
drops = football.select('Drop')
teams = football.select('Team')

# Accumulate in a list and convert once. np.append copies the whole array on
# every call, which made the original loop quadratic.
shuffled_differences = []
for i in np.arange(10000):
    # Sampling all rows without replacement is a random permutation.
    shuffled = drops.sample(drops.num_rows, with_replacement=False)
    sample = teams.with_column('Drop', shuffled.column(0))
    shuffled_differences.append(difference_in_average_drop(sample))

sampled_stats = np.array(shuffled_differences)

In [None]:
# Histogram of the shuffled statistics, with a vertical line at the observed value.
Table().with_column('Null distribution', sampled_stats).hist()
_ = plots.plot([observed] * 2, [0, 1.4])

In [None]:
# Fraction of the shuffled statistics that are at least as large as the
# observed statistic.
np.count_nonzero(sampled_stats >= observed) / sampled_stats.size