In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 17 ##

## Discussion questions ##

Data: the results of 400 tosses of a coin <br>
Question (a) ● “This coin is fair.”<br>
             ● “No, it’s not.”<br>
Question (b) ● “This coin is fair.”<br>
             ● “No, it’s biased towards heads.”<br>

In [None]:
# Fair-coin model: two outcomes with chance 0.5 each.
# The simulation loop further down reads index 0 as the proportion of heads.
coins = make_array(0.5, 0.5)

In [None]:
# One simulated run of 400 tosses drawn from the fair-coin model `coins`.
# NOTE(review): the name `simulated` is reassigned later by the jury-panel
# simulation, so this value is only meaningful for the display just below.
simulated = sample_proportions(400, coins)
simulated

In [None]:
def simulate_coin(size):
    """Simulate `size` tosses under the module-level `coins` model.

    Parameters
    ----------
    size : int
        Number of tosses to draw in one simulated experiment.

    Returns
    -------
    The result of `sample_proportions(size, coins)`, i.e. one simulated
    set of outcome proportions for the two-sided coin.
    """
    proportions = sample_proportions(size, coins)
    return proportions

In [None]:
# Repeat the 400-toss experiment `coin_time` times under the fair-coin model
# and record the proportion of heads (index 0 of each simulated sample).
#
# The proportions are gathered in a list and converted to an array once:
# calling np.append inside the loop would copy the whole array on every
# iteration, which makes the loop quadratic in `coin_time`.
coin_time = 10000
head_result = np.array(
    [simulate_coin(400).item(0) for _ in range(coin_time)],
    dtype=float,
)

# Each simulated tail proportion is defined as 1 - head proportion, so the
# tails array is computed in one vectorised step.
tail_result = 1 - head_result

In [None]:
# problem (a)
# The alternative "not fair" has no direction, so look at how far each
# simulated proportion of heads lies from 50%, regardless of sign.
Table().with_column(
    'the discrepency of head - 50%',
    np.abs(head_result - 0.5),
).hist()

In [None]:
# problem (b)
# Plot the simulated proportions of heads themselves (sign kept) to see how
# they are distributed around 50%, the fair chance of heads.
Table().with_column('head %', head_result).hist()

In [None]:
# problem (b)
# Plot heads - tails for each simulation; a value above 0 favours heads.
# Because tails = 1 - heads, this difference equals 2 * heads - 1, so the
# histogram has the same shape as the previous cell on a rescaled axis.
Table().with_column('the difference between heads and tails (heads - tails)', head_result - tail_result).hist()

## Alameda County Jury Panels ##

In [None]:
# Jury data by ethnicity: 'Eligible' is the distribution expected under the
# model, 'Panels' is the distribution actually observed on the panels.
jury = (
    Table()
    .with_column('Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01))
    .with_column('Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04))
)

jury

In [None]:
# Horizontal bar chart of the jury table, with 'Ethnicity' as the category axis.
jury.barh('Ethnicity')

In [None]:
# Under the model, this is the true distribution of people
# from which the jurors are randomly sampled.
# It is read from the table's 'Eligible' column instead of being retyped,
# so the model cannot drift from the data in `jury`.
model = jury.column('Eligible')

In [None]:
# Let's simulate a random draw of 1423 jurors from this distribution.
# This reassigns `simulated`, which earlier held the coin-toss simulation.
simulated = sample_proportions(1423, model)
simulated

In [None]:
# The actual observed distribution (Panels) looks quite different
# from the simulation -- try re-running the simulation cell and this one
# several times to confirm!
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated

In [None]:
# Same bar chart as before, now with the 'Simulated' column alongside the others.
jury_with_simulated.barh('Ethnicity')

## Distance Between Distributions

In [None]:
# In the last lecture, the difference between the observed black/purple
# proportions and their expected values (26%/75%) was our statistic.
#
# In this case, we need to understand how each of the 5 categories
# differs from its expected value according to the model.

# Per-category gap: observed panel proportion minus eligible proportion.
diffs = jury.column('Panels') - jury.column('Eligible')
jury_with_difference = jury.with_column('Difference', diffs)
jury_with_difference

## Total Variation Distance

In [None]:
def tvd(dist1, dist2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    Parameters
    ----------
    dist1, dist2 : array-like of numbers
        The two distributions, given category by category in the same order.
        Lists and tuples are accepted as well as numpy arrays.

    Returns
    -------
    numpy.float64
        ``sum(|dist1 - dist2|) / 2``.
    """
    # Coerce to arrays so plain Python sequences work too; subtracting two
    # lists directly would raise a TypeError.
    first = np.asarray(dist1, dtype=float)
    second = np.asarray(dist2, dtype=float)
    return np.abs(first - second).sum() / 2

In [None]:
# The TVD of our observed data (Panels) from their expected values,
# assuming the model is true (Eligible)
obsvd_tvd = tvd(jury.column('Panels'), jury.column('Eligible'))
obsvd_tvd

In [None]:
# The TVD of one model simulation (1423 draws) from its expected values
tvd(sample_proportions(1423, model), jury.column('Eligible'))

In [None]:
def simulated_tvd(sample_size=1423):
    """Simulate one panel under the model and return its TVD from the model.

    Parameters
    ----------
    sample_size : int, optional
        Number of jurors drawn in the simulated panel. Defaults to 1423,
        the panel size used throughout this notebook.

    Returns
    -------
    The total variation distance between the simulated proportions and
    the module-level `model` distribution.
    """
    return tvd(sample_proportions(sample_size, model), model)

# Simulate the TVD statistic `num_simulations` times under the model.
# The values are gathered in a list and converted to an array once:
# calling np.append inside the loop would copy the whole array on every
# iteration, which makes the loop quadratic in `num_simulations`.
num_simulations = 10000
tvds = np.array(
    [simulated_tvd() for _ in range(num_simulations)],
    dtype=float,
)

In [None]:
# Histogram of the simulated TVDs, with the observed TVD printed underneath
# so the two can be compared.
bins = np.arange(0, .05, .005)  # bin edges starting at 0 in steps of 0.005
title = 'Simulated TVDs (if model is true)'

Table().with_column(title, tvds).hist(bins=bins)
print('Observed TVD: ', obsvd_tvd, sep='')