BerkeleyX: Data8.2x

Foundations of Data Science: Inferential Thinking by Resampling

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lec 6.1 Introduction

In [None]:
# Jury selection in Alameda County: one row per ethnic group, with its
# proportion among eligible jurors and its proportion on the jury panels.
jury = (
    Table()
    .with_column('Ethnicity', ['Asian', 'Black', 'Latino', 'White', 'Other'])
    .with_column('Eligible', [0.15, 0.18, 0.12, 0.54, 0.01])
    .with_column('Panels', [0.26, 0.08, 0.08, 0.54, 0.04])
)
jury

In [None]:
# The eligible and panel proportions differ for several groups.
# Question: are the panels really selected at random from the eligible
# population?
jury.barh('Ethnicity')

Lec 6.2 Total Variation Distance

In [None]:
# How far apart are two distributions? Start with the per-category
# differences: panel proportion minus eligible proportion.
jury_with_diffs = jury.with_column(
    'Difference',
    jury.column('Panels') - jury.column('Eligible'),
)
jury_with_diffs

In [None]:
# Take absolute values so positive and negative differences cannot cancel
# each other out when they are added up.
jury_with_diffs = jury_with_diffs.with_column(
    'Absolute Difference', np.abs(jury_with_diffs.column('Difference')))

# A bare expression is only rendered when it is the last line of a cell, so
# display the table explicitly before printing the summary number.
jury_with_diffs.show()

# Distance between the two distributions = Total Variation Distance:
# half the sum of the absolute differences.
print(sum(jury_with_diffs.column('Absolute Difference')) / 2)

In [None]:
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between the
    corresponding proportions of the two distributions.

    Parameters
    ----------
    distribution_1, distribution_2 : array-like of numbers
        Proportions for the same categories, in the same order. NumPy
        arrays, lists and tuples are all accepted.

    Returns
    -------
    float
        Half the sum of ``|distribution_1 - distribution_2|``.

    Raises
    ------
    ValueError
        If the two distributions do not have the same shape.
    """
    # Coerce up front so plain Python sequences work as well as arrays.
    first = np.asarray(distribution_1, dtype=float)
    second = np.asarray(distribution_2, dtype=float)

    # Refuse mismatched inputs instead of letting NumPy broadcast them into
    # a meaningless comparison.
    if first.shape != second.shape:
        raise ValueError(
            'distributions must have the same shape, got {} and {}'.format(
                first.shape, second.shape))

    return np.sum(np.abs(first - second)) / 2

In [None]:
# Distance between the eligible and panel distributions, computed with the
# helper; it takes absolute differences, so argument order does not matter.
total_variation_distance(jury.column('Eligible'), jury.column('Panels'))

Lec 6.3 Assessment

In [None]:
# Distribution of the eligible population: the 'Eligible' column of `jury`,
# kept as an array so it can be reused in the cells below.
eligible = jury.column('Eligible')

In [None]:
# Draw one random sample from the eligible distribution and store its
# proportions next to the existing columns.
#   datascience.sample_proportions(sample_size, probabilities)
#   (just a wrapper around
#    np.random.multinomial(sample_size, probabilities) / sample_size)
# NOTE(review): 1453 is the sample size; presumably the total number of
# panelists in the data -- confirm against the source of the table.
panels_and_sample = jury.with_column(
    'Random Sample',
    sample_proportions(1453, eligible),
)
panels_and_sample

In [None]:
# Bar chart of the eligible, panel and random-sample proportions for each
# ethnicity.
panels_and_sample.barh('Ethnicity')

In [None]:
# TVD from the eligible distribution, one value per line:
# first for the random sample, then for the actual panels.
print(
    total_variation_distance(panels_and_sample.column('Random Sample'), eligible),
    total_variation_distance(jury.column('Panels'), eligible),
    sep='\n',
)

In [None]:
# Simulate ten thousand random samples drawn from the eligible distribution
# and record how far each one lands from that distribution.
tvds = []

panel_size = 1453    # sample size of each simulated panel
repetitions = 10000
for i in range(repetitions):
    sample_distribution = sample_proportions(panel_size, eligible)
    tvds.append(total_variation_distance(sample_distribution, eligible))

# Table.hist already normalises the histogram to a density by default.
# Do not pass `normed` and `density` together: that was a workaround for one
# specific datascience/matplotlib pairing; current datascience rejects the
# combination and current matplotlib has no `normed` argument.
Table().with_column('Total Variation Distance', tvds).hist(
    bins=np.arange(0, 0.09, 0.002), ec='w')

# The actual TVD of the panels is way outside the range of the simulated ones.

Lec 6.4 Summary