In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Alameda County Jury Panels ##

In [None]:
# Panel size; reused below as the number of draws in every simulated sample
panel_size = 1453
# Category labels; the two proportion arrays below are listed in this same order
ethnicities = make_array('Asian', 'Black', 'Latino', 'White', 'Other')
# Proportion of each category, presumably among eligible jurors (sums to 1)
eligible = make_array(0.15, 0.18, 0.12, 0.54, 0.01)
# Proportion of each category, presumably among the actual panels (sums to 1)
panels = make_array(0.26, 0.08, 0.08, 0.54, 0.04)

## Visualize ##

In [None]:
# Put both distributions side by side, one row per ethnicity
jury = (
    Table()
    .with_column('Ethnicity', ethnicities)
    .with_column('Eligible', eligible)
    .with_column('Panels', panels)
)

jury

In [None]:
# Horizontal bar chart of the Eligible and Panels columns, grouped by ethnicity
jury.barh('Ethnicity')

In [None]:
# Simulate one panel: draw panel_size people at random from the eligible
# distribution, then add the resulting proportions as a third column
sample_distribution = sample_proportions(panel_size, eligible)
panels_and_sample = jury.with_column('Random Sample', sample_distribution)

In [None]:
# Show Eligible, Panels and the simulated Random Sample side by side
panels_and_sample

In [None]:
# Same bar chart as before, now including the Random Sample column
panels_and_sample.barh('Ethnicity')

## Start Quantifying ##

In [None]:
# Signed gap per category: positive where a group's share of the panels
# exceeds its share of the eligible population, negative where it falls short
jury_with_diffs = jury.with_column(
    'Difference', jury.column('Panels') - jury.column('Eligible')
)

In [None]:
# Inspect the signed differences
jury_with_diffs

In [None]:
# Drop the sign so that positive and negative gaps cannot cancel when summed
jury_with_diffs = jury_with_diffs.with_column(
    'Absolute Difference', abs(jury_with_diffs.column('Difference'))
)

In [None]:
# Inspect the table with the Absolute Difference column added
jury_with_diffs

In [None]:
# Total absolute gap. Both distributions sum to 1, so the positive gaps and
# the negative gaps have equal magnitude and this total counts each one twice.
sum(jury_with_diffs.column('Absolute Difference'))

In [None]:
# Halve the total so each discrepancy is counted once; this is the same
# quantity that tvd() computes below
sum(jury_with_diffs.column('Absolute Difference')) / 2

In [None]:
# Total Variation Distance

def tvd(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    Parameters
    ----------
    distribution_1, distribution_2 : array-like of numbers
        Category proportions, listed in the same category order. Lists,
        tuples and NumPy arrays are all accepted.

    Returns
    -------
    float
        Half of the summed absolute differences.

    Raises
    ------
    ValueError
        If the two distributions do not have the same shape.
    """
    # Convert up front so plain lists and tuples work, not only ndarrays
    first = np.asarray(distribution_1, dtype=float)
    second = np.asarray(distribution_2, dtype=float)
    # Refuse mismatched inputs rather than letting NumPy broadcast them
    if first.shape != second.shape:
        raise ValueError(
            'distributions must have the same shape, got '
            f'{first.shape} and {second.shape}'
        )
    return np.abs(first - second).sum() / 2

In [None]:
# Observed value of the statistic: distance between the eligible
# distribution and the distribution of the actual panels
observed_tvd = tvd(eligible, panels)
observed_tvd

In [None]:
# For comparison: distance between the eligible distribution and the
# random sample drawn earlier
tvd(eligible, panels_and_sample.column('Random Sample'))

## Sampling Variability ##

In [None]:
# Another random sample
# (rebinds sample_distribution; re-running this cell gives a different value)

sample_distribution = sample_proportions(panel_size, eligible)
tvd(sample_distribution, eligible)

In [None]:
# Simulate a single value of the statistic,
# assuming panels are drawn at random from the eligible population

def one_tvd_under_random_draws():
    """Return the TVD between one simulated panel and the eligible distribution.

    Draws `panel_size` people at random according to `eligible` and measures
    how far the resulting proportions fall from `eligible` itself.
    """
    return tvd(sample_proportions(panel_size, eligible), eligible)

In [None]:
# Now generate 10,000 of these

# Build the array in one step: np.append copies the entire array on every
# call, so growing the result inside a loop does quadratic work.
tvds = np.array([one_tvd_under_random_draws() for _ in np.arange(10000)])

In [None]:
# Empirical distribution of the simulated TVDs under random selection
Table().with_column('Total Variation Distance', tvds).hist(bins = 20)

In [None]:
# Compare the observed TVD (eligible vs. panels) with the histogram above
observed_tvd