In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Comparing Two Samples

First, let's read in the data

In [None]:
# Load the births data set from baby.csv (path is relative to the notebook's working directory)
births = Table.read_table('baby.csv')

In [None]:
births

Let's look at two columns: 'Maternal Smoker' (whether the mother smoked during pregnancy) and 'Birth Weight'.

In [None]:
# Keep only the group-label column and the numerical column used in the rest of the analysis
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')

How many people were in each group?

In [None]:
smoking_and_birthweight.group('Maternal Smoker')

Let's visualize the distribution!

In [None]:
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

# Test Statistic

What's the average of each group?

In [None]:
# One row per 'Maternal Smoker' value; np.average is applied to the remaining
# column ('Birth Weight') within each group
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

What's the value of our test statistic?

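Our test statistic is the difference between the two group averages: Group B (mothers who smoked) average weight − Group A (mothers who did not smoke) average weight.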

In [None]:
# Column 1 of means_table holds the group averages as an array
means = means_table.column(1)
# average of the second row's group minus average of the first row's group
observed_difference = means.item(1) - means.item(0)
observed_difference

Write a function that takes a table, the column label of a numerical variable, and the column label of a group-label variable, and returns the difference between the means of the two groups.

In [None]:
def difference_of_means(table, label, group_label):
    """Difference between the two group means of a numerical column.

    Takes: table, column label of numerical variable,
    column label of group-label variable (must contain exactly two
    distinct values)
    Returns: (mean of the second group) - (mean of the first group),
    with the groups taken in the row order produced by `group`
    Raises: ValueError if the group-label column does not split the
    rows into exactly two groups"""

    # table with the two relevant columns, so the grouped table below has
    # exactly one label column followed by one column of means
    reduced = table.select(label, group_label)

    # table containing one row per group with that group's mean
    means_table = reduced.group(group_label, np.average)
    # array of group means
    means = means_table.column(1)

    # A "difference of the two groups" is only defined for exactly two groups;
    # fail loudly rather than silently ignoring extra groups or raising an
    # unexplained IndexError when there is only one.
    if len(means) != 2:
        raise ValueError(
            'difference_of_means needs exactly two groups in column {!r}, '
            'found {}'.format(group_label, len(means)))

    return means.item(1) - means.item(0)

Use this function to compute the test statistic for any table that contains a numerical column and a two-group label column:

In [None]:
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

# Random Permutation (Shuffling)

In [None]:
# Small one-column toy table used below to demonstrate sampling and shuffling
letters = Table().with_column('Letter', make_array('a', 'b', 'c', 'd', 'e'))

In [None]:
letters.sample()

In [None]:
letters.sample(with_replacement = False)

In [None]:
letters.with_column('Shuffled', letters.sample(with_replacement = False).column(0))

# Simulation Under Null Hypothesis

In [None]:
smoking_and_birthweight

Shuffle the labels of the "Maternal Smoker" column

In [None]:
# Sample the rows without replacement, then pull out just the
# 'Maternal Smoker' column of the sampled table as an array
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False
                                                ).column('Maternal Smoker')
shuffled_labels

Note: `.sample(...)` returns a whole table, so we need to call `.column(...)` on the result to get the shuffled labels as an array.

Let's add the shuffled labels into our original table

In [None]:
# Attach the shuffled labels as a 'Shuffled Label' column next to the
# original 'Maternal Smoker' labels so the two can be compared
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)

In [None]:
original_and_shuffled

What's our simulated test statistic?

In [None]:
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')

Compare to our observed test statistic:

In [None]:
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')

# Permutation Test

Write a function that returns the difference of means of the two groups after shuffling labels.

1. Get an array of shuffled labels
2. Add shuffled labels to your table
3. Calculate the test statistic for this simulation

In [None]:
def one_simulated_difference(table, label, group_label):
    """Simulate one value of the test statistic with shuffled group labels.

    Takes: table, column label of numerical variable,
    column label of group-label variable
    Returns: difference of means of the two groups, computed after the
    group labels have been shuffled"""

    # sample the table's rows without replacement and keep only the
    # group-label column of the result as the array of shuffled labels
    permuted_rows = table.sample(with_replacement=False)
    shuffled_labels = permuted_rows.column(group_label)

    # numerical values from the original table paired with the shuffled labels
    values_only = table.select(label)
    shuffled_table = values_only.with_column('Shuffled Label', shuffled_labels)

    return difference_of_means(shuffled_table, label, 'Shuffled Label')

In [None]:
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

Let's do this 2500 times!

In [None]:
# Number of label shuffles used to approximate the null distribution
num_simulations = 2500

# Preallocate the result array: np.append returns a fresh copy of the whole
# array on every call, so growing the array inside the loop does quadratic work
differences = np.empty(num_simulations)

for i in np.arange(num_simulations):
    # each call reshuffles the labels and yields one simulated test statistic
    differences[i] = one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Histogram of the simulated differences (the statistic's distribution when labels are shuffled)
Table().with_column('Difference Between Group Means', differences).hist()
# Print the observed statistic so it can be compared against the histogram
print('Observed Difference:', observed_difference)
# The trailing semicolon keeps the title object's text repr out of the cell output
plots.title('Prediction Under the Null Hypothesis');

What's the p-value?

In [None]:
# Empirical p-value: the fraction of simulated differences that are less than
# or equal to the observed difference (i.e. the left tail of the histogram)
p_value = np.count_nonzero(differences <= observed_difference)/len(differences)
p_value

Compare this to a 1% p-value cutoff:

In [None]:
p_value < 0.01