# Lecture 38: Comparing Samples

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline
np.set_printoptions(legacy='1.13')

## Categorical Association

In [None]:
# Load the breast-cancer data and discard the 'ID' column.
patients = Table.read_table('breast-cancer.csv').drop('ID')

In [None]:
# Put the rows in random order (sampling without replacement), then split by
# position: rows 0-340 form the training set and rows 341-682 the test set.
# NOTE(review): the hard-coded bounds assume `patients` has 683 rows — confirm.
shuffled = patients.sample(with_replacement=False) 
training = shuffled.take(np.arange(341))
test  = shuffled.take(np.arange(341, 683))

In [None]:
# Display the training table.
training

In [None]:
# Keep only the class label and the cell-size uniformity score from the
# training split, shortening the second column's label to 'Uniformity'.
# (The split created above is named `training`; `training_set` does not exist.)
training_uniformity = training.select('Class', 'Uniformity of Cell Size').relabel(1, 'Uniformity')
training_uniformity

In [None]:
# Cross-tabulate the training rows: one row per 'Uniformity' value and one
# count column per 'Class' value.
training_counts = training_uniformity.pivot('Class', 'Uniformity')
training_counts

In [None]:
def proportions(array):
    """Scale the entries of `array` so that they add up to 1.

    Every entry is divided by the sum of all entries.
    """
    total = np.sum(array)
    return array / total

def compare(t):
    """Convert a three-column table of counts into per-column proportions.

    Column 0 of `t` is kept as is. Columns 1 and 2 are each rescaled with
    `proportions` and stored under the labels 'Benign' and 'Malignant',
    both displayed with PercentFormatter.
    NOTE(review): the labels presume column 1 holds the benign counts and
    column 2 the malignant counts — confirm against the pivoted table.
    """
    benign_share = proportions(t.column(1))
    malignant_share = proportions(t.column(2))
    shares = t.select(0).with_columns(
        'Benign', benign_share,
        'Malignant', malignant_share,
    )
    return shares.set_format([1, 2], PercentFormatter)

In [None]:
# Show, for each uniformity value, its share within each class.
compare(training_counts)

In [None]:
# Same proportions as a horizontal bar chart, with column 0 as the bar labels.
compare(training_counts).barh(0)

In [None]:
# Preview the threshold rule on the first four test rows: True where the
# uniformity score exceeds 3. (The held-out split is named `test`.)
test.take(np.arange(4)).column('Uniformity of Cell Size') > 3

In [None]:
# Classify every test row as True when its uniformity score exceeds 3, then
# report the fraction of rows whose prediction equals the 'Class' value.
# The comparison relies on True == 1 and False == 0, so 'Class' is presumably
# coded as 0/1 — confirm against the data. (The held-out split is named `test`.)
classification = test.column('Uniformity of Cell Size') > 3

np.count_nonzero(classification == test.column('Class'))/test.num_rows

In [None]:
# Python booleans compare equal to integers: False equals 0.
False == 0

In [None]:
# ...and True equals 1.
True == 1

In [None]:
# Same accuracy computation with the threshold lowered to 0, i.e. every row
# with a positive uniformity score is classified True.
# (The held-out split is named `test`.)
classification = test.column('Uniformity of Cell Size') > 0

np.count_nonzero(classification == test.column('Class'))/test.num_rows

In [None]:
# Same accuracy computation with the threshold raised to 10, i.e. only rows
# with a uniformity score above 10 are classified True.
# (The held-out split is named `test`.)
classification = test.column('Uniformity of Cell Size') > 10

np.count_nonzero(classification == test.column('Class'))/test.num_rows

## Comparing two samples

In [None]:
# Show the first three rows of the full patient table.
patients.show(3)

In [None]:
# Two-column table: the class label and the 'Mitoses' value for every patient.
mitoses = patients.select('Class', 'Mitoses')

In [None]:
# Cross-tabulate all patients: one row per 'Mitoses' value and one count
# column per 'Class' value.
counts = patients.pivot('Class', 'Mitoses')
counts

In [None]:
# Distribution of 'Mitoses' within each class, as proportions.
compare(counts)

In [None]:
# The same two distributions as a horizontal bar chart.
compare(counts).barh(0)

In [None]:
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    Both arguments are arrays of proportions that each sum to 1. The result
    is half the sum of the absolute elementwise differences.
    """
    gaps = np.abs(distribution_1 - distribution_2)
    return gaps.sum() / 2

def table_tvd(table, label_1, label_2):
    """Return the total variation distance between two columns of `table`.

    `label_1` and `label_2` identify the two columns; each is fetched with
    `table.column` and handed to `total_variation_distance`.
    """
    first = table.column(label_1)
    second = table.column(label_2)
    return total_variation_distance(first, second)

In [None]:
# Observed TVD between the two class distributions (columns 1 and 2).
table_tvd(compare(counts), 1, 2)

In [None]:
# Display the class/mitoses table that the permutation steps below start from.
mitoses

In [None]:
# Randomly reorder the rows and keep only column 1 (the 'Mitoses' values).
shuffled_mitoses = mitoses.sample(with_replacement=False).column(1) # Permuted

In [None]:
# Keep the class labels in their original order and attach the permuted values.
mitoses.select(0).with_column('Shuffled', shuffled_mitoses) # Paired class with permuted values

In [None]:
# Cross-tabulate the permuted pairing: column 0 (class) supplies the count
# columns and column 1 ('Shuffled') supplies the rows.
shuffled_counts = mitoses.select(0).with_column('Shuffled', shuffled_mitoses).pivot(0, 1)
shuffled_counts

In [None]:
# Bar chart of the two class distributions after one permutation.
compare(shuffled_counts).barh(0)

In [None]:
# TVD between the two class distributions for this single permutation.
table_tvd(compare(shuffled_counts), 1, 2)

In [None]:
# Simulate the TVD under the null hypothesis: repeat the permute / pivot /
# compare steps `repetitions` times and collect one TVD per permutation.
repetitions = 5000
tvds = []
for i in np.arange(repetitions):
    # Permute the 'Mitoses' values while the class labels stay in place.
    shuffled_mitoses = mitoses.sample(with_replacement=False).column(1)
    shuffled_counts = mitoses.select(0).with_column('Shuffled', shuffled_mitoses).pivot(0, 1)
    shuffled_tvd = table_tvd(compare(shuffled_counts), 1, 2)
    tvds.append(shuffled_tvd)

# Histogram of the simulated TVDs, with the observed value printed for comparison.
Table().with_column('TVD', tvds).hist(bins=20)
plots.title('Empirical Distribution Under the Null')
print('Observed TVD:', table_tvd(compare(counts), 1, 2))

## Deflategate

In [None]:
# Load the football measurements, discard the 'Team' column and show every row.
football = Table.read_table('football.csv')
football = football.drop('Team')
football.show()

In [None]:
# Add 'Combined': the average of columns 1 and 2 for each ball.
# NOTE(review): columns 1 and 2 are presumably two separate pressure
# measurements of the same ball — confirm against football.csv.
football = football.with_column(
    'Combined', (football.column(1)+football.column(2))/2
    )
football.show()

In [None]:
# Rows whose 'Ball' label contains 'Patriots', with 'Drop' = 12.5 - 'Combined'.
# NOTE(review): 12.5 is presumably the Patriots' starting pressure — confirm.
patriots = football.where('Ball', are.containing('Patriots'))
patriots = patriots.with_column('Drop', 12.5-patriots.column('Combined'))
patriots.show()

In [None]:
# Rows whose 'Ball' label contains 'Colts', with 'Drop' = 13.0 - 'Combined'.
# NOTE(review): 13.0 is presumably the Colts' starting pressure — confirm.
colts = football.where('Ball', are.containing('Colts'))
colts = colts.with_column('Drop', 13.0-colts.column('Combined'))
colts

In [None]:
# Test statistic: mean Patriots drop minus mean Colts drop.
patriots_mean = patriots.column('Drop').mean()
colts_mean = colts.column('Drop').mean()

observed_statistic = patriots_mean - colts_mean
observed_statistic

In [None]:
# Pool all drops into one column: the Patriots' values first, then the Colts'.
drops = Table().with_column(
    'Drop', np.append(patriots.column('Drop'), colts.column('Drop'))
)
drops.show()

In [None]:
# Show one random reordering of the pooled drops.
drops.sample(with_replacement=False).show()

In [None]:
# Permutation test for the difference in mean drop (Patriots minus Colts).
simulated_statistics = []
repetitions = 10000

# `drops` lists the Patriots' values first, so after shuffling the first
# `num_patriots` rows stand in for the Patriots and the remaining rows for
# the Colts. The split point is derived from the data instead of hard-coded.
num_patriots = patriots.num_rows
patriots_rows = np.arange(num_patriots)
colts_rows = np.arange(num_patriots, drops.num_rows)

for i in np.arange(repetitions):
    shuffled = drops.sample(with_replacement=False)
    new_patriots_mean = shuffled.take(patriots_rows).column(0).mean()
    new_colts_mean = shuffled.take(colts_rows).column(0).mean()
    new_statistic = new_patriots_mean - new_colts_mean
    simulated_statistics.append(new_statistic)

# Convert the list to an array explicitly so the elementwise comparison does
# not depend on NumPy's reflected operator being triggered by a plain list.
empirical_P = np.count_nonzero(np.array(simulated_statistics) >= observed_statistic)/repetitions

print('Observed Statistic:', observed_statistic)
print('Empirical P:', empirical_P)
results = Table().with_column('Simulated Statistic', simulated_statistics)
results.hist()
# Mark the observed statistic on the horizontal axis of the histogram.
plots.scatter(observed_statistic, 0, color='red', s=30);

## A/B Testing

In [None]:
# Load the baby data set and display it.
baby = Table.read_table('baby.csv')
baby

In [None]:
# Count the rows for each value of 'Maternal Smoker'.
baby.group('Maternal Smoker')

In [None]:
# Bin edges 40, 45, ..., 180, shared by the birth-weight histograms.
weight_bins = np.arange(40, 181, 5)
# Birth-weight histogram for rows where 'Maternal Smoker' is False.
baby.where('Maternal Smoker', False).hist('Birth Weight', bins=weight_bins, unit='ounce')

In [None]:
# Birth-weight histogram for rows where 'Maternal Smoker' is True, same bins.
baby.where('Maternal Smoker', True).hist('Birth Weight', bins=weight_bins, unit='ounce')

In [None]:
# Bin the birth weights of non-smokers and smokers separately, join the two
# binned tables on 'bin', relabel the count columns, and draw the histogram
# from the pre-binned counts.
baby.where('Maternal Smoker', False).bin('Birth Weight', bins=weight_bins).join(
    'bin', baby.where('Maternal Smoker', True).bin('Birth Weight', bins=weight_bins)
).relabeled(1, 'Non-Smoker').relabeled(2, 'Smoker').hist(bin_column='bin')

In [None]:
# Two-column table (group label, birth weight) and the mean weight per group.
smoke_weight = baby.select('Maternal Smoker', 'Birth Weight')
means = smoke_weight.group(0, np.mean)
means

In [None]:
def difference_in_means(t):
    """Return the difference between the two group means of a two-column table.

    `t` is grouped by column 0 with np.mean. The result is the mean in the
    first grouped row minus the mean in the second grouped row, taken from
    column 1 of the grouped table.
    """
    group_means = t.group(0, np.mean).column(1)
    return group_means.item(0) - group_means.item(1)

# Observed difference between the two groups' mean birth weights.
difference_in_means(smoke_weight)

In [None]:
def permutation_test_means(table, variable, classes, repetitions, unit=None):
    """Run a permutation test on the difference between two group means.

    Tests whether two numerical samples come from the same underlying
    distribution, using the absolute difference between the two group
    means as the test statistic. Draws a histogram of the simulated
    statistics and prints the observed statistic and the empirical
    P-value. Returns None.

    table -- Table containing the sample
    variable -- label of the column containing the numerical variable
    classes -- label of the column naming the two samples (A & B)
    repetitions -- number of random permutations to simulate
    unit -- optional unit passed through to the histogram
    """

    t = table.select(classes, variable)
    observed = abs(difference_in_means(t))

    # These single-column tables do not change between permutations,
    # so build them once instead of on every iteration.
    class_labels = t.select(classes)
    values = t.select(variable)

    # Assuming the null is true, randomly permute the variable
    # and collect all the generated test statistics.
    # A list is used for accumulation: np.append copies the whole array
    # on every call, which makes the loop quadratic in `repetitions`.
    simulated = []
    for _ in np.arange(repetitions):
        shuffled_var = values.sample(with_replacement=False).column(0)
        shuffled = class_labels.with_column('Shuffled Variable', shuffled_var)
        simulated.append(abs(difference_in_means(shuffled)))
    stats = np.array(simulated, dtype=float)

    # Find the empirical P-value: the share of simulated statistics
    # at least as large as the observed one.
    p = np.count_nonzero(stats >= observed) / repetitions

    # Draw the empirical histogram of the test statistics generated under
    # the null, and compare with the value observed in the original sample
    Table().with_column('Test Statistic', stats).hist(unit=unit)
    plots.title('Empirical Distribution Under the Null')
    print('Observed statistic:', observed)
    print('Empirical P-value:', p)

In [None]:
# Permutation test of birth weight between the two 'Maternal Smoker' groups,
# using 1000 permutations.
permutation_test_means(baby, 'Birth Weight', 'Maternal Smoker', 1000, 'ounce')

## Effect Size

In [None]:
def bootstrap_ci_means(table, variable, classes, repetitions, unit=None):
    """Bootstrap an approximate 95% confidence interval for the difference
    between the means of the two classes in the population.

    Each repetition resamples the rows with `t.sample()` and recomputes
    `difference_in_means`, i.e. the mean of the first group minus the mean
    of the second, in the order produced by grouping. The interval runs
    from the 2.5th to the 97.5th percentile of those differences. Draws a
    histogram with the interval marked in yellow and prints the endpoints.
    Returns None.

    table -- Table containing the sample
    variable -- label of the column containing the numerical variable
    classes -- label of the column naming the two classes
    repetitions -- number of bootstrap resamples
    unit -- optional unit passed through to the histogram
    """

    t = table.select(classes, variable)

    # Accumulate in a list: np.append copies the whole array on every call,
    # which makes the loop quadratic in `repetitions`.
    simulated = []
    for _ in np.arange(repetitions):
        bootstrap_sample = t.sample()
        simulated.append(difference_in_means(bootstrap_sample))
    stats = np.array(simulated, dtype=float)

    # Middle 95% of the bootstrap distribution.
    left = percentile(2.5, stats)
    right = percentile(97.5, stats)

    Table().with_column('Difference Between Means', stats).hist(unit=unit)
    # Mark the interval along the horizontal axis.
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8)
    print('Approximate 95% CI for the difference between means:')
    print(left, 'to', right)

In [None]:
# Interval for the difference in mean birth weight between the two groups.
bootstrap_ci_means(baby, 'Birth Weight', 'Maternal Smoker', 1000, 'ounce')

In [None]:
# Interval for the difference in mean maternal age between the two groups.
bootstrap_ci_means(baby, 'Maternal Age', 'Maternal Smoker', 1000, 'year')

In [None]:
# Interval for the difference in mean maternal height between the two groups.
bootstrap_ci_means(baby, 'Maternal Height', 'Maternal Smoker', 1000, 'inch')

## RCTs

In [None]:
# Load the trial data and display it.
bta = Table.read_table('bta.csv')
bta

In [None]:
# Mean of every other column within each value of 'Group'.
bta.group('Group', np.mean)

In [None]:
# Permutation test of 'Result' between the two values of 'Group'.
permutation_test_means(bta, 'Result', 'Group', 1000)

In [None]:
# Bootstrap interval for the difference in mean 'Result' between the groups.
bootstrap_ci_means(bta, 'Result', 'Group', 1000)