In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Review: Comparing Two Samples

In [None]:
# Read the births data from baby.csv; the path is relative to the
# notebook's working directory.
births = Table.read_table('baby.csv')

In [None]:
def difference_of_means(table, numeric_label, group_label):
    """
    Difference between the means of a numerical variable in two groups.

    Takes:
        table: the table holding both columns
        numeric_label: column label of the numerical variable
        group_label: column label of the group-label variable; it must
            split the rows into exactly two groups

    Returns: (mean of the second group) - (mean of the first group),
    where the groups are ordered as the rows of table.group(group_label)

    Raises: ValueError if group_label does not produce exactly two groups
    """

    # table with the two relevant columns
    reduced = table.select(numeric_label, group_label)

    # table containing group means, one row per group
    means_table = reduced.group(group_label, np.average)

    # array of group means
    means = means_table.column(1)

    # Items 0 and 1 are only meaningful when there are exactly two groups:
    # a third group would be silently ignored, and a single group would
    # fail with an unhelpful IndexError.
    if len(means) != 2:
        raise ValueError(
            'difference_of_means needs exactly two groups in column '
            '{!r}, found {}'.format(group_label, len(means)))

    return means.item(1) - means.item(0)

In [None]:
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate one value of the difference of means with shuffled labels.

    Takes:
        table: the table holding both columns
        numeric_label: column label of the numerical variable
        group_label: column label of the group-label variable

    Returns: the value of difference_of_means after the group labels
    have been reassigned to the rows in a random order
    """

    # Draw the rows without replacement, then keep only the label column:
    # this is the array of shuffled labels.
    reordered_rows = table.sample(with_replacement=False)
    shuffled_labels = reordered_rows.column(group_label)

    # Pair the numerical variable, in its original order, with the
    # shuffled labels.
    numeric_only = table.select(numeric_label)
    shuffled_table = numeric_only.with_column('Shuffled Label', shuffled_labels)

    return difference_of_means(shuffled_table, numeric_label, 'Shuffled Label')

In [None]:
# Keep only the grouping column and the measured column, then average the
# birth weights within each value of 'Maternal Smoker'.
smoker_and_weight = births.select("Maternal Smoker", "Birth Weight")
smoker_and_weight.group('Maternal Smoker', np.average)

# Randomized Controlled Experiment

In [None]:
# Read the experiment data from bta.csv; the path is relative to the
# notebook's working directory.
botox = Table.read_table('bta.csv')
# show() is called without a row limit, so the whole table is displayed
# rather than a preview.
botox.show()

How can we easily see how many people were in each category?

In [None]:
# Cross-tabulate counts: one row per 'Group' value and one column per
# 'Result' value.
botox.pivot('Result', 'Group')

Find the average result for each group

In [None]:
# Average the remaining columns within each 'Group' value.
# NOTE(review): assumes 'Result' is coded numerically (e.g. 0/1), in which
# case its average is the proportion of 1s per group; confirm against bta.csv.
botox.group('Group', np.average)

# Testing the Hypothesis

What's our test statistic here? What is the observed test statistic?

In [None]:
# Observed test statistic. difference_of_means subtracts the first group's
# mean from the second group's mean, in the row order of
# botox.group('Group'), so the sign depends on how the groups are ordered.
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff

Simulate one test statistic

In [None]:
# One value of the statistic computed after the 'Group' labels have been
# shuffled across the rows. No random seed is set in the cells shown here,
# so expect a different value each time this cell runs.
one_simulated_difference(botox, 'Result', 'Group')

Store 1000 simulated test statistics in an array called `simulated_diffs`

In [None]:
# Number of label shufflings used to approximate the distribution of the
# test statistic under the null hypothesis.
num_simulations = 1000

# Build the array in one step: calling np.append inside a loop copies the
# whole array on every iteration, whereas collecting the values first and
# converting once yields the same array without the repeated copies.
simulated_diffs = np.array([
    one_simulated_difference(botox, 'Result', 'Group')
    for _ in np.arange(num_simulations)
])

Visualize `simulated_diffs`

In [None]:
col_name = 'Distances between groups'
# Wrap the simulated statistics in a one-column table so that the table's
# hist method can draw their empirical distribution.
simulated_table = Table().with_column(col_name, simulated_diffs)
simulated_table.hist(col_name)

Find the p-value

In [None]:
# Empirical p-value: the share of simulated statistics that are at least
# as large as the observed one.
num_at_least_observed = np.count_nonzero(simulated_diffs >= observed_diff)
num_at_least_observed / len(simulated_diffs)