In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Alameda County Juries

In [None]:
# Data from an ACLU 2010 report:
# "Racial and Ethnic Disparities in Alameda County Jury Pools"
# https://www.aclunc.org/sites/default/files/racial_and_ethnic_disparities_in_alameda_county_jury_pools.pdf

ethnicity_labels = make_array('Asian', 'Black', 'Latino', 'White', 'Other')
eligible_share = make_array(0.15, 0.18, 0.12, 0.54, 0.01)
panel_share = make_array(0.26, 0.08, 0.08, 0.54, 0.04)

alameda = Table().with_columns(
    'Ethnicity', ethnicity_labels,
    'Eligible', eligible_share,
    'Panels', panel_share,
)

# Display both proportion columns as whole-number percentages.
alameda.set_format(['Eligible', 'Panels'], PercentFormatter(0))

In [None]:
# Horizontal bars for every numeric column, one group of bars per ethnicity.
alameda.barh('Ethnicity')

## Total Variation Distance

In [None]:
# Signed gap between the eligible share and the panel share for each group.
eligible_minus_panels = alameda.column('Eligible') - alameda.column('Panels')
diff = alameda.with_column('Difference', eligible_minus_panels)
diff

In [None]:
# Drop the sign so that over- and under-representation do not cancel out.
absolute_gaps = np.abs(diff.column('Difference'))
abs_diff = diff.with_column('Abs. Difference', absolute_gaps)
abs_diff

In [None]:
# Half the sum of the absolute differences: the total variation distance
# between the 'Eligible' and 'Panels' columns, computed by hand.
sum(abs_diff.column('Abs. Difference')) / 2

In [None]:
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    Each distribution is a sequence of proportions that sums to 1, with the
    categories listed in the same order in both. The distance is half the sum
    of the absolute differences between corresponding proportions.

    Arguments may be NumPy arrays or any array-like (lists, tuples); they are
    converted to float arrays before subtracting, so plain Python sequences
    no longer raise a TypeError.
    """
    proportions_1 = np.asarray(distribution_1, dtype=float)
    proportions_2 = np.asarray(distribution_2, dtype=float)
    return np.abs(proportions_1 - proportions_2).sum() / 2

In [None]:
def table_tvd(table, label_1, label_2):
    """Total variation distance between two columns of ``table``.

    ``label_1`` and ``label_2`` identify the two columns of proportions that
    are handed to ``total_variation_distance``.
    """
    first_distribution = table.column(label_1)
    second_distribution = table.column(label_2)
    return total_variation_distance(first_distribution, second_distribution)

In [None]:
# Observed statistic: distance between the eligible population and the actual panels.
observed = table_tvd(table=alameda, label_1='Eligible', label_2='Panels')
observed

## Simulating the statistic

In [None]:
def get_one_simulated_panel(t, n):
    """Draw one simulated panel of ``n`` rows from table ``t``.

    Only the 'Ethnicity' column is kept, and rows are drawn with
    ``Table.sample`` using the 'Eligible' column of ``t`` as the weights.
    """
    ethnicity_only = t.select('Ethnicity')
    eligible_weights = t.column('Eligible')
    return ethnicity_only.sample(n, weights=eligible_weights)

# 1453 is the panel size used by every Alameda simulation in this notebook
# (presumably the number of panelists in the ACLU data -- TODO confirm).
get_one_simulated_panel(alameda, 1453)

In [None]:
def simulate_once(t, n):
    """Simulate one panel of size ``n`` and attach its proportions to ``t``.

    A panel is drawn with ``get_one_simulated_panel``, the draws are counted
    per ethnicity, and the resulting proportions are returned as a 'Random'
    column joined onto ``t`` by 'Ethnicity'.

    Every category listed in ``t`` appears in the result, with a proportion
    of 0 when the category was never drawn.
    """
    simulated_panel = get_one_simulated_panel(t, n)
    counts = simulated_panel.group('Ethnicity')

    # group() only lists the categories that were actually drawn. Without
    # zero-filling, the join below would silently drop any category that
    # received no draws, and a TVD computed from the result would be taken
    # over an incomplete distribution.
    count_by_label = dict(zip(counts.column('Ethnicity'), counts.column('count')))
    all_labels = t.column('Ethnicity')
    full_counts = np.array([count_by_label.get(label, 0) for label in all_labels])

    sim_proportions = t.select('Ethnicity').with_column('Random', full_counts / n)
    sim_proportions.set_format(1, PercentFormatter(0))
    return t.join('Ethnicity', sim_proportions)

In [None]:
# One simulated panel shown next to the real columns, for comparison.
simulate_once(t=alameda, n=1453)

In [None]:
# Empirical distribution of the TVD between the eligible population
# and a randomly selected panel: 1000 repetitions of the simulation.

simulated_tvds = [
    table_tvd(simulate_once(alameda, 1453), 'Eligible', 'Random')
    for _ in np.arange(1000)
]
tvds = np.array(simulated_tvds)

results = Table().with_column('TVD', tvds)
results

In [None]:
# Bin edges from 0 up to (not including) 0.2, in steps of 0.005.
tvd_bins = np.arange(0, 0.2, 0.005)
results.hist(bins=tvd_bins)

## P-value

In [None]:
# Empirical p-value: the share of simulated TVDs at least as large as the
# observed one. `observed` (computed with table_tvd on the 'Eligible' and
# 'Panels' columns) is used instead of a hand-copied 0.14, so the threshold
# cannot drift from the data or be thrown off by rounding.
results.where('TVD', are.above_or_equal_to(observed)).num_rows / results.num_rows

## Addendum: Alameda County Race & Ethnicity Distribution

In [None]:
# Figures from the 2010 Census, https://www.census.gov/2010census/popmap/

race_rows = [
    ['White', 649122],
    ['African American', 190451],
    ['Asian', 394560],
    ['AIAN', 9799],
    ['NHPI', 12802],
    ['Some Other Race', 162540],
    ['Two or more Races', 90997],
]
alameda_race = Table(['Race', 'Population']).with_rows(race_rows)

# Show each population count together with its share of the total.
alameda_race.set_format('Population', DistributionFormatter).show()

ethnicity_rows = [
    ['Hispanic or Latino', 339889],
    ['Not Hispanic or Latino', 1170382],
]
alameda_ethnicity = Table(['Ethnicity', 'Population']).with_rows(ethnicity_rows)

alameda_ethnicity.set_format('Population', DistributionFormatter).show()

In [None]:
# Redisplay the jury table so it can be compared with the census tables above.
alameda

## P-value, using the observed TVD

In [None]:
# Share of simulated TVDs that are at least as large as the observed TVD.
at_least_observed = results.where('TVD', are.above_or_equal_to(observed))
at_least_observed.num_rows / results.num_rows

## Swain v. Alabama, repeat

In [None]:
# Two-category version of the jury data: eligible share vs. panel share.
swain_groups = make_array('Black', 'Other')
swain_eligible = make_array(0.26, 0.74)
swain_panel = make_array(0.08, 0.92)

swain = Table().with_columns(
    'Ethnicity', swain_groups,
    'Eligible', swain_eligible,
    'Panel', swain_panel,
)

# Display both proportion columns as whole-number percentages.
swain.set_format(['Eligible', 'Panel'], PercentFormatter(0))

In [None]:
# Horizontal bars for every numeric column, one group of bars per category.
swain.barh('Ethnicity')

In [None]:
# One simulated panel of 100 people drawn from the eligible distribution.
simulate_once(t=swain, n=100)

In [None]:
# Empirical distribution of the TVD between the eligible population
# and a random panel of 100: 10000 repetitions of the simulation.

simulated_tvds = [
    table_tvd(simulate_once(swain, 100), 'Eligible', 'Random')
    for _ in np.arange(10000)
]
tvds = np.array(simulated_tvds)

results = Table().with_column('TVD between the population & a random sample', tvds)
results

In [None]:
# Bin edges from 0 up to (not including) 0.20, in steps of 0.010.
swain_bins = np.arange(0, 0.20, 0.010)
results.hist(bins=swain_bins)

# Birth months

What month were you born in?

* A) Jan-Mar
* B) Apr-Jun
* C) Jul-Sep
* D) Oct-Dec

In [None]:
# Class tally of birth quarters; edit the counts to match the poll results.
quarter_labels = make_array("Jan-Mar", "Apr-Jun", "Jul-Sep", "Oct-Dec")
quarter_counts = make_array(5, 5, 5, 5)

birth_month = Table().with_columns(
    "Month", quarter_labels,
    "Count", quarter_counts,
)
birth_month

In [None]:
# Class size is the total of the per-quarter counts.
size_of_class = sum(birth_month.column("Count"))
# Sum of the absolute differences between the class's proportions and a
# uniform 25% per quarter.
# NOTE(review): there is no division by 2 here, so this is twice the value
# total_variation_distance would return for the same inputs. The simulation
# cell further down uses the same un-halved statistic, so the two remain
# comparable; change both together or neither.
observed = sum(abs(birth_month.column("Count")/size_of_class - .25))
observed

How likely is this distribution of birth months?

In [None]:
# One random class: draw size_of_class quarters at random, then tally them.
random_class = birth_month.select("Month").sample(size_of_class)
random_counts = random_class.group("Month")
random_counts

In [None]:
# Simulate the statistic for 10000 random classes of the same size.
# The statistic is the sum of absolute differences from a uniform 25% per
# quarter, in the same (un-halved) form as `observed`.
quarters = birth_month.column("Month")
simulated_tvds = []
for i in np.arange(10000): # 10000 repetitions
    sampled_quarters = birth_month.select("Month").sample(size_of_class).column("Month")
    # Count every quarter explicitly. Grouping the sample would omit any
    # quarter that was never drawn, which would drop its |0 - .25| term and
    # understate the distance for that repetition.
    random_counts = np.array([np.count_nonzero(sampled_quarters == quarter)
                              for quarter in quarters])
    tvd = sum(abs(random_counts/sum(random_counts) - .25))
    simulated_tvds.append(tvd)
tvds = np.array(simulated_tvds)

results = Table().with_column('TVD', tvds)

In [None]:
# Table of the simulated statistics, one row per repetition.
results

In [None]:
# Histogram of the simulated statistics, using the default bins.
results.hist()

In [None]:
# Empirical p-value: the share of simulated statistics at least as large as
# the observed one. Dividing by results.num_rows, not a hard-coded 10000,
# keeps this correct if the number of repetitions is changed.
results.where('TVD', are.above_or_equal_to(observed)).num_rows / results.num_rows