In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## New material

### The case of Robert Swain

In [None]:
# Model proportions for the two categories (they sum to 1).
# The first entry (.26) is the one panel_proportion() reads via .item(0).
population_proportions = make_array(.26, .74)
population_proportions

In [None]:
# Draw one random sample of size 100 from the model; per the datascience docs
# the result holds the sampled proportion for each category.
sample_proportions(100, population_proportions)

In [None]:
def panel_proportion():
    """Simulate one panel of 100 drawn from `population_proportions`.

    Returns the first entry of the sampled proportions, i.e. the sampled
    proportion for the category whose model probability is listed first.
    """
    simulated_panel = sample_proportions(100, population_proportions)
    return simulated_panel.item(0)

In [None]:
# One simulated proportion; each run draws a fresh random sample.
panel_proportion()

In [None]:
# Simulate 10,000 panels and record each one's count (proportion * 100).
# The values are gathered in a single pass and converted to an array once:
# np.append returns a new copy of the array on every call, so growing
# `panels` with it inside the loop does quadratic work.
num_panels = 10000
panels = np.array([panel_proportion() * 100 for _ in np.arange(num_panels)])

In [None]:
# Display the array of simulated counts (one entry per simulated panel).
panels

In [None]:
# Histogram of the simulated counts. Bin edges sit at half-integers
# (5.5, 6.5, ...) so each bar is centered on a whole-number count.
Table().with_column(
    'Number of Black men on panel of 100', panels
).hist(bins=np.arange(5.5,40.))

# Plotting details; ignore this code
# Extend the y-axis slightly below 0 so the marker drawn at y=0 is not clipped.
plots.ylim(-0.002, 0.09)
# Red dot at 8 on the horizontal axis -- presumably the observed count; confirm.
plots.scatter(8, 0, color='red', s=30);

### Mendel's pea plant model

In [None]:
# Observed proportion: 709 out of 929 (purple-flowering plants, going by the
# variable name -- confirm against the data description).
observed_purples = 709 / 929
observed_purples

In [None]:
# Proportions predicted by the model (.75 / .25), followed by one random
# sample of 929 drawn from them.
predicted_proportions = make_array(.75, .25)
sample_proportions(929, predicted_proportions)

In [None]:
def purple_flowers():
    """Simulate one sample of 929 from `predicted_proportions`.

    Returns the absolute difference between the first sampled proportion
    and 0.75.
    """
    sampled_proportion = sample_proportions(929, predicted_proportions).item(0)
    return abs(sampled_proportion - 0.75)

In [None]:
# One simulated absolute difference; each run draws a fresh random sample.
purple_flowers()

In [None]:
# Simulate the statistic 10,000 times under the model.
# The values are gathered in a single pass and converted to an array once:
# np.append returns a new copy of the array on every call, so growing
# `purples_differences` with it inside the loop does quadratic work.
num_purple_samples = 10000
purples_differences = np.array(
    [purple_flowers() for _ in np.arange(num_purple_samples)]
)

In [None]:
# Histogram of the simulated absolute differences.
Table().with_column('Absolute difference if the model is true', purples_differences).hist()

# Plotting details; ignore this code
# Extend the y-axis slightly below 0 so the marker drawn at y=0 is not clipped.
plots.ylim(-0.02*100, 0.6*100)
# Red dot at the observed statistic: |observed proportion - 0.75|.
plots.scatter(abs(observed_purples -0.75), 0, color='red', s=30);

### Jury selection in Alameda County

*The ACLU compiled data on the composition of the jury panels in 11 felony trials in Alameda County in the years 2009 and 2010. In those panels, the total number of people who reported for jury service was 1453.*

In [None]:
# One row per ethnicity. 'Eligible' and 'Panels' are two distributions over
# the same five categories; each column sums to 1.
jury = Table().with_columns(
    'Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'),
    'Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01),
    'Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04)
)

jury

In [None]:
    # Horizontal bar chart of the numeric columns, labeled by 'Ethnicity'.
    jury.barh('Ethnicity')

Below is the distribution of the eligible population. Under our model, the 1453 panelists were randomly sampled from this distribution.

In [None]:
# Same values as the 'Eligible' column of `jury`, in the same category order.
model = make_array(0.15, 0.18, 0.12, 0.54, 0.01)

In [None]:
# One random sample of 1453 drawn from `model`, expressed as proportions.
simulated = sample_proportions(1453, model)
simulated

In [None]:
# Add the simulated proportions as a new column next to 'Eligible' and 'Panels'.
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated

In [None]:
# Bar chart comparing the eligible, observed-panel and simulated distributions.
jury_with_simulated.barh('Ethnicity')

#### We need a new statistic!

In [None]:
# Per-category difference: panel proportion minus eligible proportion.
# Positive means the category has a larger share in the panels than in the eligible column.
diffs = jury.column('Panels') - jury.column('Eligible')
jury_with_difference = jury.with_column('Difference', diffs)
jury_with_difference

**Discussion [1 min]:** How can we best combine the values in the `Difference` column into a single number?

In [None]:
# Attempt 1: plain sum. Both distributions sum to 1, so the positive and
# negative differences cancel and the total is 0 (up to rounding error).
sum(jury_with_difference.column('Difference'))

In [None]:
# Attempt 2: add up only the positive differences.
sum(jury_with_difference.where('Difference', are.above(0)).column('Difference'))

In [None]:
# Attempt 3: half the sum of the absolute differences.
sum(abs(jury_with_difference.column('Difference')))/2

#### The Total Variation Distance (**TVD**)

In [None]:
def tvd(dist1, dist2):
    """Return the total variation distance between two distributions.

    `dist1` and `dist2` are arrays of proportions over the same categories,
    in the same order. The result is half the sum of the absolute
    element-wise differences.
    """
    absolute_gaps = abs(dist1 - dist2)
    return sum(absolute_gaps) / 2

In [None]:
# Observed statistic: TVD between the panel and eligible distributions.
obsvd_tvd = tvd(jury.column('Panels'), jury.column('Eligible'))
obsvd_tvd

In [None]:
# One simulated TVD under the model. It is stored under its own name so that
# it does not share the name `simulated_tvd` with the function defined later
# in this notebook; re-running this cell would otherwise replace that
# function with a float and break the simulation loop.
one_simulated_tvd = tvd(sample_proportions(1453, model), jury.column('Eligible'))
one_simulated_tvd

In [None]:
def simulated_tvd():
    """Simulate one panel of 1453 drawn from `model` and return its TVD from `model`."""
    return tvd(sample_proportions(1453, model), model)


# Simulate the statistic `num_simulations` times under the model.
# The values are gathered in a single pass and converted to an array once:
# np.append returns a new copy of the array on every call, so growing `tvds`
# with it inside the loop does quadratic work.
num_simulations = 10000
tvds = np.array([simulated_tvd() for _ in np.arange(num_simulations)])

In [None]:
# Histogram of the simulated TVDs, with bin edges of width 0.005 starting at 0
# and stopping before 0.05.
title = 'Simulated TVDs (if model is true)'
bins = np.arange(0, .05, .005)

Table().with_column(title, tvds).hist(bins = bins)
print('The observed TVD of ' + str(obsvd_tvd) + ' is nowhere close to the simulated TVDs!')

# Plotting details; ignore this code
# Extend the y-axis slightly below 0 so the marker drawn at y=0 is not clipped.
plots.ylim(-2, 55)
# Red dot at the observed TVD on the horizontal axis.
plots.scatter(obsvd_tvd, 0, color='red', s=30);