In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## The Toast Myth

We saw the Mythbusters crew do an experiment with 48 pieces of toast, where 29 landed butter side up and 19 butter side down.  Let's see if we can figure out how likely this outcome would be if toast were equally likely to land on either side.  In particular, we'll play a "what-if" game: what if toast were equally likely to land on either side?  Let's simulate what would happen under that assumption.

In [None]:
# Load the table the simulation samples from; later cells rely on its 'Outcome' column.
possible_outcomes = Table.read_table('toast_possible.csv')

In [None]:
# Inspect the table that the simulation samples from.
possible_outcomes

In [None]:
# One simulated experiment: draw 48 rows, one per piece of toast.
# NOTE(review): Table.sample draws with replacement by default per the datascience docs — confirm.
simulated_experiment = possible_outcomes.sample(48)

In [None]:
# Each row is the outcome drawn for one simulated piece of toast.
simulated_experiment

In [None]:
# Count how many of the 48 simulated pieces produced each outcome.
simulated_experiment.group('Outcome')

In [None]:
# The statistic for this simulation: the count of 'Butter Side Up' rows.
# NOTE(review): the [0] index assumes at least one such row was drawn; a sample
# with none would leave nothing to index.
number_up = simulated_experiment.group('Outcome').where('Outcome', 'Butter Side Up').column('count')[0]

In [None]:
# The butter-side-up count from this one simulated experiment.
number_up

## Repeated simulation

Now we're going to repeat the simulation 10000 times, and keep track of the statistic (the count of butter-side-up) we get from each simulation.

In [None]:
# Repeat the 48-toast simulation 10000 times, recording the butter-side-up
# count from each run.
#
# Matching rows are counted directly, so a sample with no 'Butter Side Up'
# rows contributes 0 instead of failing on an empty lookup. The array is built
# in one step rather than re-copied on every append, and the single-experiment
# `number_up` from the earlier cell is left untouched.
counts = np.array(
    [
        np.count_nonzero(possible_outcomes.sample(48).column('Outcome') == 'Butter Side Up')
        for _ in np.arange(10000)  # 10000 repetitions
    ],
    dtype=float,  # keep the counts stored as floats
)
results = Table().with_column('Number that landed butter-side-up', counts)

In [None]:
# One row per simulation, holding that simulation's butter-side-up count.
results

In [None]:
# Histogram of the simulated counts, using unit-width bins with edges 12, 13, ..., 35.
results.hist(bins=np.arange(12,36,1))

In [None]:
# Fraction of the 10000 simulations with at least 29 butter-side-up,
# 29 being the count observed in the real experiment.
results.where('Number that landed butter-side-up', are.above_or_equal_to(29)).num_rows / 10000

In [None]:
# Fraction of the 10000 simulations whose count is not between 20 and 29.
# NOTE(review): per the datascience docs, are.between(20, 29) includes 20 and
# excludes 29, so this should keep counts <= 19 or >= 29 (both tails) — confirm.
results.where('Number that landed butter-side-up', are.not_between(20, 29)).num_rows / 10000

## Swain v. Alabama

In [None]:
# Two distributions over the same ethnicity categories: the 'Eligible'
# proportions and the 'Panel' proportions.
swain = Table().with_columns(
    'Ethnicity', make_array('Black', 'Other'),
    'Eligible', make_array(0.26, 0.74),
    'Panel', make_array(0.08, 0.92)
)

# Show both proportion columns as whole-number percentages. set_format returns
# the table, so this final expression also displays it.
swain.set_format(['Eligible', 'Panel'], PercentFormatter(0))

In [None]:
# Horizontal bar chart comparing the 'Eligible' and 'Panel' proportions for each ethnicity.
swain.barh('Ethnicity')

## Total Variation Distance

In [None]:
# Signed gap between the two distributions for each ethnicity (Eligible minus Panel).
diff = swain.with_column('Difference', swain.column('Eligible') - swain.column('Panel'))
diff

In [None]:
# Absolute gaps: drop the sign so positive and negative differences cannot cancel.
abs_diff = diff.with_column('Abs. Difference', np.abs(diff.column('Difference')))
abs_diff

In [None]:
# Total variation distance: half the sum of the absolute differences.
# Sum the 'Abs. Difference' column; the signed 'Difference' column sums to
# (about) zero whenever both distributions sum to 1.
sum(abs_diff.column('Abs. Difference')) / 2

In [None]:
def total_variation_distance(distribution_1, distribution_2):
    """Return half the sum of the absolute differences between two distributions.

    Each distribution is an array of proportions that sums to 1.
    """
    gaps = np.abs(distribution_1 - distribution_2)
    return gaps.sum() / 2

In [None]:
def table_tvd(table, label_1, label_2):
    """Return the total variation distance between two columns of `table`.

    label_1 and label_2 name the columns; their values are passed to
    total_variation_distance unchanged.
    """
    first, second = table.column(label_1), table.column(label_2)
    return total_variation_distance(first, second)

In [None]:
# TVD between the 'Eligible' and 'Panel' distributions (about 0.18 for these values).
table_tvd(swain, 'Eligible', 'Panel')

## Simulating the statistic, for Swain v. Alabama

In [None]:
# Sampling the table directly draws whole rows; no weights are given, so the
# 'Eligible' proportions play no part in which rows are picked.
swain.sample(10)

In [None]:
# Keep only the 'Ethnicity' column so that samples contain just category labels.
ethnicity = swain.select('Ethnicity')
ethnicity

In [None]:
# Still an unweighted sample: no weights are supplied here.
ethnicity.sample(10)

In [None]:
# The 'Eligible' proportions, used below as the sampling weights for each ethnicity.
population_distribution = swain.column('Eligible')
population_distribution

In [None]:
# Draw 10 ethnicities, weighting each row by its 'Eligible' proportion.
ethnicity.sample(10, weights=population_distribution)

In [None]:
# Number of people in each simulated panel; random_jury_panel below reads this global.
panel_size = 100
# Simulate one panel: panel_size ethnicities drawn with the 'Eligible' proportions as weights.
swain.select('Ethnicity').sample(panel_size, weights=swain.column('Eligible'))

In [None]:
# Simulate one panel, then turn the per-ethnicity counts into proportions.
panel = swain.select('Ethnicity').sample(panel_size, weights=swain.column('Eligible'))
counts = panel.group('Ethnicity')  # NOTE: rebinds `counts`, which earlier held the toast simulation array
sample_proportions = counts.select('Ethnicity').with_column('Random', counts.column('count') / panel_size)
# Show the simulated proportions as whole-number percentages.
sample_proportions.set_format('Random', PercentFormatter(0))

In [None]:
# The 'Eligible' and 'Panel' proportions, for comparison with the simulated ones.
swain

In [None]:
# Line the simulated 'Random' proportions up against 'Eligible' and 'Panel', matching rows on 'Ethnicity'.
swain.join('Ethnicity', sample_proportions)

In [None]:
def random_jury_panel():
    """Return swain joined with a 'Random' column from one simulated panel.

    Draws panel_size ethnicities using the 'Eligible' proportions as weights
    and records the proportion of the panel in each ethnicity. Every ethnicity
    in swain gets a value, including 0 for one that was never drawn, so the
    returned table always contains every category.
    """
    panel = swain.select('Ethnicity').sample(panel_size, weights=swain.column('Eligible'))
    drawn = panel.column('Ethnicity')
    # Tally against swain's own categories rather than grouping the sample:
    # group omits categories with no draws, and the join below would then
    # silently drop those rows and distort any TVD computed from the result.
    ethnicities = np.unique(swain.column('Ethnicity'))
    proportions = np.array([np.count_nonzero(drawn == e) for e in ethnicities]) / panel_size
    sample_proportions = Table().with_columns('Ethnicity', ethnicities, 'Random', proportions)
    sample_proportions.set_format('Random', PercentFormatter(0))
    return swain.join('Ethnicity', sample_proportions)

In [None]:
# Try it: each call draws a fresh random panel.
random_jury_panel()

## Repeating the simulation multiple times, for Swain v. Alabama

In [None]:
# Compute the empirical distribution of TVDs

# Each repetition simulates one random panel and measures the TVD between its
# 'Random' proportions and the 'Eligible' distribution. Collecting the values
# in one pass avoids re-copying the whole array on every append and leaves no
# loop variables behind in the kernel.
tvds = np.array([
    table_tvd(random_jury_panel(), 'Eligible', 'Random')
    for _ in np.arange(10000)  # Repetitions
])

results = Table().with_column('TVD between the population & a random sample', tvds)
results

In [None]:
# Histogram of the simulated TVDs; the bins run far enough right to include
# 0.18, the TVD between the 'Eligible' and 'Panel' columns.
results.hist(bins=np.arange(0, 0.20, 0.005))

In [None]:
# The same histogram zoomed in on TVDs from 0 to roughly 0.06.
results.hist(bins=np.arange(0, 0.06, 0.005))

## Alameda County Juries

In [None]:
# Data from an ACLU 2010 report
# Racial and Ethnic Disparities in Alameda County Jury Pools
# https://www.aclunc.org/sites/default/files/racial_and_ethnic_disparities_in_alameda_county_jury_pools.pdf

# Two distributions over the same ethnicity categories: the 'Eligible'
# proportions and the 'Panels' proportions.
panels = Table().with_columns(
    'Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'),
    'Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01),
    'Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04)
)

# Show both proportion columns as whole-number percentages. set_format returns
# the table, so this final expression also displays it.
panels.set_format(['Eligible', 'Panels'], PercentFormatter(0))

In [None]:
# Horizontal bar chart comparing the 'Eligible' and 'Panels' proportions for each ethnicity.
panels.barh('Ethnicity')

In [None]:
# NOTE(review): this repeats the total_variation_distance defined earlier in
# the notebook — presumably so this section can run on its own. Keep the two
# definitions identical, or drop this one.
def total_variation_distance(distribution_1, distribution_2):
    """Return half the sum of the absolute differences between two distributions.

    Each distribution is an array of proportions that sums to 1.
    """
    gaps = np.abs(distribution_1 - distribution_2)
    return gaps.sum() / 2

# NOTE(review): this repeats the table_tvd defined earlier in the notebook;
# keep the two definitions identical, or drop this one.
def table_tvd(table, label_1, label_2):
    """Return the total variation distance between two columns of `table`.

    label_1 and label_2 name the columns; their values are passed to
    total_variation_distance unchanged.
    """
    first, second = table.column(label_1), table.column(label_2)
    return total_variation_distance(first, second)

# TVD between the 'Eligible' and 'Panels' distributions (about 0.14 for these values).
table_tvd(panels, 'Eligible', 'Panels')

In [None]:
# Weighted-sampling demo: draw 1000 rows with 'heads' weighted 0.8 and 'tails' 0.2,
# then count how many times each side came up.
Table().with_columns('side', ['heads', 'tails']).sample(1000, weights=[.8, .2]).group('side')

In [None]:
def sample_from_distribution(t, category_label, distribution_label, sample_size):
    """Return a random sample of sample_size rows of t's category column.

    Only the `category_label` column of t is sampled; the values in the
    `distribution_label` column are passed as the sampling weights.
    """
    return t.select(category_label).sample(
        sample_size,
        weights=t.column(distribution_label),
    )

# Try it: 10 ethnicities drawn with the 'Eligible' proportions as weights.
sample_from_distribution(panels, 'Ethnicity', 'Eligible', 10)

In [None]:
def with_random_sample(t, category_label, distribution_label, sample_size):
    """Return t with a column that contains category proportions of a random sample.

    The sample of sample_size rows is drawn by sample_from_distribution, using
    the `distribution_label` column as weights. The new column is named
    'Random'. Every category in t gets a value, including 0 for one that was
    never drawn, so the returned table always contains every category.
    """
    sample = sample_from_distribution(t, category_label, distribution_label, sample_size)
    drawn = sample.column(category_label)
    # Tally against t's own categories rather than grouping the sample: group
    # omits categories with no draws, and the join below would then silently
    # drop those rows and distort any TVD computed from the result.
    categories = np.unique(t.column(category_label))
    proportions = np.array([np.count_nonzero(drawn == c) for c in categories]) / sample_size
    sample_proportions = Table().with_columns(category_label, categories, 'Random', proportions)
    sample_proportions.set_format('Random', PercentFormatter(0))
    return t.join(category_label, sample_proportions)

# Try it with a sample of 1453.
# NOTE(review): 1453 is presumably the number of panelists in the ACLU data — confirm.
with_random_sample(panels, 'Ethnicity', 'Eligible', 1453)

## Repeated simulations, for Alameda County

In [None]:
# Compute the empirical distribution of TVDs

# Each repetition draws one random sample of 1453 and measures the TVD between
# its 'Random' proportions and the 'Eligible' distribution. Collecting the
# values in one pass avoids re-copying the whole array on every append and
# leaves no loop variables behind in the kernel.
tvds = np.array([
    table_tvd(with_random_sample(panels, 'Ethnicity', 'Eligible', 1453), 'Eligible', 'Random')
    for _ in np.arange(1000)  # Repetitions
])

results = Table().with_column('TVD between the population & a random sample', tvds)
results

In [None]:
# Histogram of the simulated TVDs in steps of 0.01; the bins run far enough
# right to include 0.14, the TVD between the 'Eligible' and 'Panels' columns.
results.hist(bins=np.arange(0, 0.2, 0.01))