In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Review: Comparing Two Samples

In [None]:
def difference_of_means(table, numeric_label, group_label):
    """
    Difference between the group means of a numerical variable.

    Takes: name of table, column label of numerical variable,
    column label of group-label variable

    Returns: Difference of means of the two groups, computed as
    (mean of the second group) - (mean of the first group), where
    "first" and "second" refer to the row order of the grouped table.

    Raises: ValueError if the group-label column does not contain
    exactly two distinct groups, since a single difference is only
    meaningful for a two-group comparison.
    """

    # table with the two relevant columns
    reduced = table.select(numeric_label, group_label)

    # table containing group means: column 0 holds the group labels,
    # column 1 holds the mean of the numerical variable in each group
    means_table = reduced.group(group_label, np.average)

    # array of group means
    means = means_table.column(1)

    # Fail loudly instead of silently ignoring extra groups or raising
    # an opaque IndexError when only one group is present.
    if len(means) != 2:
        raise ValueError(
            'difference_of_means needs exactly two groups in column '
            + repr(group_label) + ', found ' + str(len(means)))

    return means.item(1) - means.item(0)

In [None]:
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate the test statistic once under random relabeling.

    Takes: name of table, column label of numerical variable,
    column label of group-label variable

    Returns: Difference of means of the two groups after the group
    labels have been shuffled among the rows
    """
    shuffled_column = 'Shuffled Label'

    # sampling every row without replacement yields a random permutation
    permuted_rows = table.sample(with_replacement=False)

    # numerical variable in its original order, paired with permuted labels
    relabeled = table.select(numeric_label).with_column(
        shuffled_column, permuted_rows.column(group_label))

    return difference_of_means(relabeled, numeric_label, shuffled_column)

In [None]:
# Read the births data from 'baby.csv' into a Table
births = Table.read_table('baby.csv')
# Last expression of the cell: show the table as the cell output
births

In [None]:
# Average birth weight within each 'Maternal Smoker' group
(
    births
    .select('Maternal Smoker', 'Birth Weight')
    .group('Maternal Smoker', np.average)
)

In [None]:
# Observed test statistic: difference between the average 'Birth Weight'
# of the two 'Maternal Smoker' groups, as computed by difference_of_means
observed_difference = difference_of_means(births,'Birth Weight', 'Maternal Smoker')
observed_difference

In [None]:
#Can we use our function above to simulate 1 difference?
#one_simulated_difference(births,'Birth Weight', 'Maternal Smoker')
#How about repeating 1000 times?
# Collect all simulated differences first and build the array once:
# np.append allocates and copies a new array on every call, so growing
# the array inside the loop does quadratic work.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in np.arange(1000)
])
differences


In [None]:
#To conclude something from this test, let's visualize
# One-column table holding the simulated differences, so they can be drawn with Table.hist
diff_Table = Table().with_column('Simulated average difference', differences)
diff_Table.hist(bins = 20)
# Mark the observed difference as a red dot on the horizontal axis (y = 0)
plots.scatter(observed_difference, 0, color='red', s=40, zorder=3)
plots.title('Prediction Under the Null Hypothesis')
print('Observed Difference:', observed_difference)

In [None]:
#What did we conclude?

# Randomized Controlled Experiment

A randomized controlled trial (RCT) examined the effect of using Botulinum Toxin A (BTA) as a treatment (https://pubmed.ncbi.nlm.nih.gov/11376175/). Botulinum toxin is a neurotoxic protein that causes the disease botulism; Wikipedia says that botulinum “is the most acutely lethal toxin known.” There are seven types of botulinum toxin. Botulinum Toxin A is one of the types that can cause disease in humans, but it is also used in medicine to treat various diseases involving the muscles. The RCT analyzed by Foster, Clapp, and Jabbari in 2001 examined it as a treatment for low back pain.

In [None]:
# Read the trial data from 'bta.csv' into a Table and display it
botox = Table.read_table('bta.csv')
botox.show()

In [None]:
#Let's look at the results by looking at the 2 groups (control/treatment)
# Grouping without an aggregation function counts the rows in each group
botox.group('Group')
#Which table methods can we use to look at the counts?

In [None]:
#Instead of looking at counts, 
#can we look at the average? (since values are 1 or 0, this should equal the proportion)
# Average of the remaining columns within each 'Group'
botox.group('Group', np.average)

# Testing the Hypothesis

In [None]:
#What test statistic should we use?
# Absolute difference between the two groups' average 'Result';
# abs() discards the sign, so only the distance between the groups matters
observed_diff = abs(difference_of_means(botox,'Result','Group'))
observed_diff

In [None]:
#Let's do a permutation test/shuffle -> one simulation of the test statistic
# Shuffle the 'Group' labels once and take the absolute difference of the group means
abs(one_simulated_difference(botox,'Result', 'Group'))

In [None]:
#How about repeating it 10000 times
# Collect all simulated statistics first and build the array once:
# np.append allocates and copies a new array on every call, so growing
# the array inside the loop does quadratic work.
abs_diff = np.array([
    abs(one_simulated_difference(botox, 'Result', 'Group'))
    for _ in np.arange(10000)
])

In [None]:
#What do we need to do to visualize the result?
# Put the simulated statistics in a one-column table and draw their histogram
Table().with_column('Average distance', abs_diff).hist()
# Mark the observed statistic as a red dot on the horizontal axis (y = 0)
plots.scatter(observed_diff, 0, color='red', s=40, zorder=3)
plots.title('Prediction Under the Null Hypothesis')
print('Observed Distance:', observed_diff)

In [None]:
# Can we calculate the p-value? What does this suggest?
# Share of simulated statistics that are at least as large as the observed one.
# NOTE: because of the factor of 100, pValue is a percentage (0-100),
# not a proportion (0-1); compare it against 5, not 0.05, for a 5% cutoff.
pValue = 100 *np.count_nonzero(abs_diff >= observed_diff)/len(abs_diff)
pValue