In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 12
## Distribution of M&M Colors

In [None]:
# One row per M&M color: the proportion the model assigns to it ("Theoretical")
# next to the proportion actually observed ("Sample").
candies = Table().with_columns(
    "Color", make_array('red', 'orange', 'yellow', 'green', 'blue', 'brown'),
    "Theoretical", make_array(0.13, 0.2, 0.14, 0.16, 0.24, 0.13),
    "Sample", make_array(0.08, 0.245, 0.198, 0.198, 0.17, 0.109),
)
candies

In [None]:
# Plotting details: horizontal bars per color, with the "Theoretical" and
# "Sample" proportions drawn on the same axes for comparison
candies.barh("Color")

In [None]:
# Under the model, this is the true distribution of colors
# from which the M&Ms are randomly sampled.
# It is read from the table's "Theoretical" column instead of retyping the
# six proportions, so the model and the table can never disagree.
model = candies.column('Theoretical')

In [None]:
# Let's simulate a random draw of 132 M&Ms from this distribution.
# The result holds one simulated proportion per entry of `model`.
# NOTE(review): 132 is presumably the number of candies in the observed
# sample -- confirm.
simulated = sample_proportions(132, model)
simulated

In [None]:
# The actual observed distribution (Sample) looks quite different
# from the simulation -- try running this several times to confirm!
# (`simulated` is only redrawn by the cell that assigns it, so re-run that
# cell first to see a new simulation here.)
candies_with_simulated = candies.with_column('Simulated', simulated)
candies_with_simulated

In [None]:
# Bar chart of the Theoretical, Sample and Simulated proportions for each color
candies_with_simulated.barh('Color')

## Distance Between Distributions

In [None]:
# In the last lecture, the difference between observed purples
# and the expected values (75%) was our statistic.
#
# In this case, we need to understand how each of the 6 categories
# differs from its expected value according to the model.

# Signed gap per color: positive means the sample has a larger share of
# that color than the model says, negative means a smaller share.
diffs = candies.column('Sample') - candies.column('Theoretical')
candies_with_difference = candies.with_column('Difference', diffs)
candies_with_difference

## Total Variation Distance

In [None]:
def tvd(dist1, dist2):
    """Return the total variation distance between two distributions.

    The TVD is half the sum of the absolute differences between
    corresponding proportions.

    dist1, dist2: proportions, one entry per category, listed in the same
        category order. Numpy arrays, lists and tuples are all accepted.

    Raises ValueError if the two distributions do not have the same shape
    (without this check a length mismatch could be silently broadcast).
    """
    first = np.asarray(dist1, dtype=float)
    second = np.asarray(dist2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            'tvd needs two distributions of the same shape, got '
            + str(first.shape) + ' and ' + str(second.shape))
    # The built-in sum keeps the original left-to-right accumulation, so
    # results for array inputs are unchanged.
    return sum(np.abs(first - second)) / 2

In [None]:
# The TVD of our observed data (Sample) from their expected values
# assuming the model is true (Theoretical).
# This is the observed value of the statistic that the simulated TVDs
# are compared against.
obsvd_tvd = tvd(candies.column('Sample'), candies.column('Theoretical'))
obsvd_tvd

In [None]:
# The TVD of one model simulation from its expected values
# (a fresh sample is drawn every time this cell runs, so the value varies)
tvd(sample_proportions(132, model), candies.column('Theoretical'))

In [None]:
def simulated_tvd(sample_size=132):
    """Simulate one TVD between a random sample from the model and the model.

    sample_size: number of candies in the simulated sample. Defaults to 132,
        the size used for every other simulated draw in this notebook
        (the previous hard-coded 123 was a transposed-digit typo).

    Returns the total variation distance between the simulated proportions
    and `model`.
    """
    return tvd(sample_proportions(sample_size, model), model)

# Simulate the statistic many times to see how the TVD behaves when the
# model is true. The values are collected in a list and converted to an
# array once; calling np.append inside the loop would copy the whole
# array on every iteration.
num_simulations = 10000
tvds = np.array([simulated_tvd() for _ in range(num_simulations)])

In [None]:
# Histogram of the simulated TVDs, with the observed TVD printed alongside
# so the two can be compared.
title = 'Simulated TVDs (if model is true)'
Table().with_column(title, tvds).hist()
print(f'Observed TVD: {obsvd_tvd}')

# The GSI's Defense

In [None]:
# Load the scores from a CSV file given by a relative path.
# Later cells rely on this table having 'Section' and 'Midterm' columns.
scores = Table.read_table('scores_by_section.csv')
scores

In [None]:
# Number of rows in each section (presumably one row per student -- confirm)
scores.group('Section')

In [None]:
# Per-section averages of the remaining columns; .show() displays every row
scores.group('Section', np.average).show()

In [None]:
# Observed value of the test statistic, compared below against the
# simulated averages of the 'Midterm' column.
# NOTE(review): hard-coded -- presumably copied from the per-section
# averages displayed above; confirm the value and which section it is.
observed_average = 13.6667

In [None]:
# One random sample of 27 rows of `scores`, drawn without replacement.
# NOTE(review): 27 is presumably the size of the section being tested -- confirm.
random_sample = scores.sample(27, with_replacement=False)
random_sample

In [None]:
# Average midterm score of that one random sample
np.average(random_sample.column('Midterm'))

In [None]:
# One simulated value of the test statistic, under the hypothesis that
# the section is like a random sample from the class

def random_sample_midterm_avg():
    """Return the average 'Midterm' score of 27 rows sampled from `scores`.

    The rows are drawn without replacement, so no row is counted twice.
    """
    sampled_midterms = scores.sample(27, with_replacement=False).column('Midterm')
    return np.average(sampled_midterms)

In [None]:
# Simulate 50,000 copies of the test statistic.
# The values are collected in a list and converted to an array once;
# calling np.append inside the loop would copy the whole array on each of
# the 50,000 iterations.

num_sample_averages = 50000
sample_averages = np.array(
    [random_sample_midterm_avg() for _ in range(num_sample_averages)]
)

## Making a Decision

In [None]:
# Compare the simulated distribution of the statistic
# and the actual observed statistic
averages_tbl = Table().with_column('Random Sample Average', sample_averages)
averages_tbl.hist(bins = 20)
# Red dot marks the observed average; y = -0.01 places it just below the x-axis
plots.scatter(observed_average, -0.01, color='red', s=120);

### Approach 1

In [None]:
# (1) Calculate the p-value: simulation area beyond observed value,
#     i.e. the proportion of simulated averages at or below the observed one.
#     Dividing by the actual number of simulated values keeps the proportion
#     correct even if the number of simulations is changed.
np.count_nonzero(sample_averages <= observed_average) / len(sample_averages)
# (2) See if this is less than 5%

### Approach 2

In [None]:
# (1) Find the simulated value 5% of the way into the sorted averages.
#     The index is one twentieth of the number of simulated values
#     (2500 when there are 50,000), so it follows the simulation count
#     instead of being hard-coded.
five_percent_point = averages_tbl.sort(0).column(0).item(averages_tbl.num_rows // 20)
five_percent_point

In [None]:
# (2) See if this value is greater than observed value.
#     If it is, the observed average lies in the lowest 5% of the
#     simulated averages.
observed_average

### Visual Representation

In [None]:
# Histogram of the simulated averages with the 5% cutoff and the observed
# value marked on it
averages_tbl.hist(bins = 20)
# Vertical gold line at the 5% point, drawn from y = 0 up to y = 0.35
plots.plot([five_percent_point, five_percent_point], [0, 0.35], color='gold', lw=2)
plots.title('Area to the left of the gold line: 5%');
# Red dot just below the x-axis marks the observed average
plots.scatter(observed_average, -0.01, color='red', s=120);