# Lecture 17: Simulation

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

### Structure of simulation

**Purpose:** simulate sampling from a population, computing a statistic, and visualizing the bias and variability of that statistic

In [None]:
# ###########################
# # Template for a simulation
# # (Commented out: the "..." placeholders won't run as written.)
# ###########################

# # Create table to represent population
# population = Table().with_column('data', make_array(...))

# # Create empty array to accumulate statistics
# statistics = make_array()

# # "Tuning knobs" for simulation
# repetitions = ...
# sample_size = ...

# for _ in np.arange(repetitions):
#     sample = population.sample(sample_size).column('data')
#     statistic = ... # computed from sample
#     statistics = np.append(statistics, statistic)

# statistics

## Pea Flowers

In [None]:
# Population of colors: three 'Purple' entries for every 'White' one,
# so the proportion of 'Purple' in this array is .75.
pea_colors = make_array(
    'Purple', 'Purple', 'Purple',
    'White',
)
pea_colors

**Question.** Assume Mendel's theories are correct.  The proportion of purple plants is .75.  What kind of number is that?

A. Population parameter  
B. Population statistic  
C. Sample parameter  
D. Sample statistic  
E. I don't know

<br/><br/><br/><br/><br/>

In [None]:
sample_size = 929


def get_sample():
    """Draw one random sample of pea-flower colors.

    Reads the notebook-level names ``pea_colors`` and ``sample_size`` at call
    time, wraps the colors in a one-column table, and samples ``sample_size``
    rows from it using ``Table.sample``'s default settings.

    Returns:
        The 'color' column of the sampled table.
    """
    population = Table().with_column('color', pea_colors)
    drawn_rows = population.sample(sample_size)
    return drawn_rows.column('color')

In [None]:
# Draw one sample and show the colors it contains
get_sample()

In [None]:
def proportion_purple(sample):
    """Return the proportion of entries in `sample` equal to 'Purple'.

    Args:
        sample: a sequence of color strings (NumPy array, list, or tuple).

    Returns:
        The fraction of entries that are exactly 'Purple', as a float
        between 0 and 1.
    """
    # Convert first: comparing a plain list or tuple to 'Purple' yields a
    # single False rather than an element-wise result, which would silently
    # report a proportion of 0.
    colors = np.asarray(sample)
    return np.mean(colors == 'Purple')

In [None]:
# Each run draws a fresh sample, so the value shown changes from run to run
proportion_purple(get_sample())  # sample statistic

**Discussion question.** Every time we run the cell above, we get a different statistic.  How variable is that statistic?  

<br/><br/><br/><br/><br/><br/><br/><br/>

### Another way to sample

In [None]:
# Two-row table with one row per distinct color.
# NOTE: this rebinds `pea_colors`, which earlier held an array of four colors
# and is read by get_sample(); from here on the name refers to a Table.
pea_colors = Table().with_column('color', make_array('Purple', 'White'))
pea_colors

In [None]:
# Sample rows from the two-color table, giving the 'Purple' row weight .75 and
# the 'White' row weight .25.  The number of draws comes from sample_size
# (929) instead of repeating the literal here.
sample = pea_colors.sample(sample_size, weights=make_array(.75, .25))
proportion_purple(sample.column('color'))

### Simulation

In [None]:
# Create table to represent population: each color paired with its chance
peas = (
    Table()
    .with_column('color', make_array('Purple', 'White'))
    .with_column('chance', make_array(.75, .25))
)
peas

In [None]:
# "Tuning knobs" for simulation
repetitions = 5000
sample_size = 929

# Create empty array to accumulate statistics (one value added per repetition)
proportions = make_array()

for _ in np.arange(repetitions):
    # Sample colors, weighting each row of peas by its 'chance'
    sample = peas.sample(sample_size, weights=peas.column('chance')).column('color')
    proportions = np.append(proportions, proportion_purple(sample))

proportions

In [None]:
# Convert the simulated proportions to percents for plotting
percents = Table().with_column('Plants', proportions * 100)
# np.arange excludes its stop value, so stop at 81 to make 80 the last bin
# edge; stopping at 80 would end the bins at 79 and leave out larger values.
percents.hist(bins=np.arange(70, 81, 1), unit='percent that are purple')

## Comparing to observed data

In [None]:
# Observed counts, and the observed percent of plants that are purple
observed_purples, total_plants = 705, 929
observed_percent_purple = 100 * observed_purples / total_plants
observed_percent_purple

In [None]:
# Histogram of the simulated percents, with the observed percent marked as a
# red dot on the horizontal axis.  The bins stop at 81 (exclusive) so the last
# bin edge is 80; np.arange(70, 80, 1) would end at 79 and drop larger values.
percents.hist(bins=np.arange(70, 81, 1), unit='percent that are purple')
plots.scatter(observed_percent_purple, 0, color='red', s=100);

## A different statistic

In [None]:
def mystery_statistic(sample):
    """Return |p - 0.75|, where p is the fraction of `sample` equal to 'Purple'.

    Args:
        sample: a sequence of color strings (NumPy array, list, or tuple).

    Returns:
        A non-negative float.
    """
    # Convert first so lists and tuples are compared element-wise too; a plain
    # list == 'Purple' is a single False, which would count zero purples.
    colors = np.asarray(sample)
    return abs(np.mean(colors == 'Purple') - 0.75)

**Question.** Describe that statistic in your own words.

<br/><br/><br/><br/><br/>

In [None]:
# "Tuning knobs" for simulation
repetitions = 5000
sample_size = 929

# Empty array to accumulate the simulated statistics, one per repetition
statistics = make_array()

for _ in np.arange(repetitions):
    # Sample colors, weighting each row of peas by its 'chance'
    sample = peas.sample(sample_size, weights=peas.column('chance')).column('color')
    statistics = np.append(statistics, mystery_statistic(sample))

statistics

In [None]:
# Apply the same formula as mystery_statistic to the observed counts
observed_statistic = abs(observed_purples / total_plants - 0.75)
observed_statistic

In [None]:
# Histogram of the simulated statistics, with the observed statistic marked
# as a red dot on the horizontal axis
pred = Table().with_column('Statistic', statistics)
pred.hist()
plots.scatter(observed_statistic, 0, color='red', s=100);

## Yet another statistic

**Question.** What do you expect the histogram produced by the simulation below to look like?

In [None]:
###############################################################
# A new statistic

def number_of_different_colors(sample):
    """
    Returns the number of different colors in sample.
    For example, would return 2 on the input
      make_array('Purple', 'White').

    Args:
        sample: a sequence of color strings (NumPy array or list).

    Returns:
        The count of distinct values in sample, as an int.
    """
    # np.unique collapses repeated values, so its length is the number of
    # distinct colors; this avoids building and grouping a Table on each call.
    return len(np.unique(sample))

###############################################################
# Everything below here is the same simulation we've been doing 

# "Tuning knobs" for simulation
repetitions = 5000
sample_size = 929

# Empty array to accumulate the simulated statistics, one per repetition
statistics = make_array()

for _ in np.arange(repetitions):
    # Sample colors, weighting each row of peas by its 'chance'
    sample = peas.sample(sample_size, weights=peas.column('chance')).column('color')
    statistics = np.append(statistics, number_of_different_colors(sample))

###############################################################
# Plot a histogram of the simulated statistic
Table().with_column('Statistic', statistics).hist()

## Swain v. Alabama

In [None]:
# Proportion of each race among eligible jurors and on the actual panel
swain = Table().with_columns(
    'Race', make_array('Black', 'Other'),
    'Eligible', make_array(0.26, 0.74),
    'Panel', make_array(0.08, 0.92)
)

# Apply the percent formatter to both proportion columns, named by label
swain.set_format(['Eligible', 'Panel'], PercentFormatter(0))

In [None]:
# Horizontal bar chart of the swain table, with one group of bars per race
swain.barh('Race')

## Simulating

In [None]:
# Show the table again for reference before simulating from it
swain

**Question.** How could we write code to produce the races of a new randomly sampled panel of 100 jurors?  That is, an array such as `array(['Black', 'Other', ..., 'Black'])` with 100 elements?

<br/><br/><br/><br/><br/>

In [None]:
# Draw 100 rows of races, weighting each row of swain by its 'Eligible' value
random_panel = swain.select('Race').sample(100, weights=swain.column('Eligible'))
random_panel.column('Race')

In [None]:
# Let's summarize that panel: group by race to get a 'count' for each one
counts = random_panel.group('Race')
counts

In [None]:
# Turn the counts into proportions of the 100-juror panel.
# NOTE: this rebinds `proportions`, which earlier held the array of simulated
# pea proportions; from here on the name refers to a Table.
proportions = counts.select('Race').with_column(
    'Random Panel', counts.column('count') / 100
)
proportions.set_format('Random Panel', PercentFormatter(0))

In [None]:
# For sake of comparison: join the 'Random Panel' column onto swain by 'Race'
# and plot the result as a horizontal bar chart
swain.join('Race', proportions).barh('Race')

**Question.** Does the random panel look more like the actual panel?  Or more like the eligible population?

In [None]:
def random_jury_panel(panel_size=100):
    """Simulate one jury panel drawn at random using the 'Eligible' weights.

    Reads the notebook-level ``swain`` table at call time.

    Args:
        panel_size: number of jurors to draw; defaults to 100.

    Returns:
        A table with a 'Race' column and a 'Random Panel' column holding the
        proportion of the simulated panel in each race.
        NOTE(review): a race that is never drawn presumably gets no row,
        since the rows come from grouping the drawn panel -- callers should
        not assume every race is present.
    """
    panel = swain.select('Race').sample(panel_size, weights=swain.column('Eligible'))
    counts = panel.group('Race')
    proportions = counts.select('Race').with_column(
        'Random Panel', counts.column('count') / panel_size
    )
    proportions.set_format('Random Panel', PercentFormatter(0))
    return proportions

random_jury_panel()

In [None]:
# Simulate 1000 random panels, recording the proportion of Black jurors in each

proportion_black = make_array()

for _ in np.arange(1000):
    sample = random_jury_panel()
    black_rows = sample.where('Race', 'Black')
    # A panel in which no Black juror was drawn has no 'Black' row, so
    # .item(0) on the empty column would raise; record a proportion of 0.
    if black_rows.num_rows > 0:
        p = black_rows.column('Random Panel').item(0)
    else:
        p = 0.0
    proportion_black = np.append(proportion_black, p)

results = Table().with_column('Proportion black', proportion_black)
results

In [None]:
# NOTE(review): switches NumPy to its 1.13-style printing; why this cell needs
# it is not evident here -- presumably a display workaround, confirm before
# removing.
np.set_printoptions(legacy='1.13')

# The simulated proportions are whole hundredths (count / 100), which sit
# exactly on the edges of bins built from np.arange(0, .51, .01); floating-point
# rounding of those edges can then place a value in the neighbouring bin.
# Shifting the edges by half a bin centers each bin on a possible value.
results.hist(bins=np.arange(-.005, .51, .01))
# Red dot at .08, the 'Panel' proportion for 'Black' in the swain table
plots.scatter(.08, 0, color='red', s=100);