In [None]:
import numpy as np
from datascience import *
from math import *

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Resampling with Replacement

Why do we resample with replacement? How is that different from resampling without replacement? 

Recall that, in bootstrapping, we resample with replacement, drawing the same number of observations as the original sample.

In [None]:
# Midterm score data from CS61A, Fall 2017
scores = Table.read_table("scores.csv")

# Keep only the midterm 1 percentage column and preview its first rows
mt1_scores = scores.select("MT1 %")
mt1_scores.show(5)

In [None]:
# What is the original mean? (mean of the single "MT1 %" column in mt1_scores)
np.mean(mt1_scores.column(0))


In [None]:
# What if we sampled withOUT replacement?
# We draw as many rows as the table has, so every row is drawn exactly once.
scores_n = scores.num_rows

wo_replacement = mt1_scores.sample(scores_n, with_replacement = False)
wo_replacement.show(5)

In [None]:
# Mean without replacement, same sample size
# (every original score appears exactly once, so expect the original mean,
# up to floating-point rounding)
np.mean(wo_replacement.column(0))

In [None]:
# Now, what if we sample with replacement?
# Same number of rows as before, but now a row may be drawn more than once.
w_replacement = mt1_scores.sample(scores_n, with_replacement = True)
w_replacement.show(5)

In [None]:
# Mean with replacement, same sample size
# (compare this against the original mean computed above)
np.mean(w_replacement.column(0))

## Bootstrapping: What is the average MT1 score?

The `scores` dataset is population-level; in other words, it has all of the information for every student in the class. That isn't good for our purpose, since there is no reason for us to do inferential statistics if we have all of the data. 

(Note: this does not happen very often at all; you'll usually start off with a sample.)

Instead, let's imagine we randomly asked a sample of 60 students what they got on MT1. Then, let's use that sample of 60 to estimate what the whole class got on the exam.

In [None]:
np.random.seed(12345) # So we all get the same data

# Our pretend survey: 60 MT1 scores drawn at random, without replacement
class_sample = mt1_scores.sample(60, with_replacement = False)
class_sample.show(5)

In [None]:
## Question 1: Let's practice taking a resample with replacement.
# Resample from class_sample, keeping the resample the same size as class_sample.
one_resample = ...
one_resample

In [None]:
# Did we get some variation?
# False means the two means differ, i.e. the resample DID vary from the sample.
np.mean(class_sample.column(0)) == np.mean(one_resample.column(0))

In [None]:
# Question 2: Resampling with replacement... many times (let's try 2000!)

def bootstrap_stats(tbl, repetitions)
    """
    tbl = your data, a 1-column table of numerical scores
    repetitions = num bootstrap repetitions/resamples
    """
    # Storage array for each resampled statistic (mean)
    bootstrap_stats = ... 

    # Iterate to create many different resamples, and calculate each individual mean
    for ...:
        ...

    # return the resampled statistic
    return bootstrap_stats

mt1_resampled_scores = bootstrap_stats(class_sample, 2000)

In [None]:
## Question 3: Make a histogram of the resampled statistics
# (the resampled means from Question 2 are stored in mt1_resampled_scores)
...


In [None]:
## Question 4: Calculate a 95% confidence interval.
# Hint: use the percentile function! It takes in 2 arguments,
# the percent and the array of values
# NOTE(review): np.percentile takes its arguments in the opposite order
# (values first, then percent); the hint presumably refers to the
# `percentile` function from the datascience package -- confirm.

lower_bound = ...
upper_bound = ...

[lower_bound, upper_bound]

## What do we conclude about the MT1 score based on the interval above? 

Type your answer here.


In [None]:
# The true mean MT1 score, computed from the full mt1_scores table
# (not from our sample of 60):
true_val = np.mean(mt1_scores.column(0))
true_val

In [None]:
## Properties of intervals: what does confidence mean?

def fast_intervals():
    """
    Build one bootstrap percentile interval for the mean MT1 score.

    Draws a simple random sample of 60 scores (without replacement) from the
    global `mt1_scores` table, bootstraps that sample 1000 times, and returns
    the 2.5th and 97.5th percentiles of the resampled means as a
    (left, right) tuple.
    """
    # Simple random sample of 60 scores, drawn without replacement
    srs = np.random.choice(mt1_scores.column(0), size = 60, replace = False)

    # Mean of each of 1000 same-size resamples drawn with replacement from the sample
    resampled_means = [
        np.mean(np.random.choice(srs, size = 60, replace = True))
        for _ in np.arange(1000)
    ]

    # The middle 95% of the resampled means forms the interval
    return np.percentile(resampled_means, 2.5), np.percentile(resampled_means, 97.5)

In [None]:
# Making some intervals and a graph; it'll take a sec
intervals = [fast_intervals() for i in range(100)]
# zip() returns a one-shot iterator in Python 3, which cannot be indexed;
# materialize it so bounds[0] is all left endpoints and bounds[1] all right endpoints.
bounds = list(zip(*intervals))

# Vertical distance between consecutive interval lines
step_size = 4

plt.xlim(min(bounds[0]), max(bounds[1]));
# Size the y-axis from the number of intervals so that every one of them is
# visible (a fixed upper limit of 100 would cut off most of the lines).
plt.ylim(0, step_size * len(intervals));
step = 0
for interval in intervals:
    # One horizontal gold line per interval, drawn at height `step`
    plt.plot(interval, (step, step), color = "gold");
    step += step_size

# Vertical line marking the true mean, for comparison with the intervals
plt.axvline(true_val, color = "navy");
plt.xlabel("MT1 Score Average");

In [None]:
# Count the intervals whose endpoints bracket the true mean
captured = sum(left <= true_val <= right for left, right in intervals)

captured # Number of intervals that got the parameter, out of 100

In [None]:
# Manually creating + checking many intervals can be computationally intensive
# But in theory, if we made 10000 intervals, about ? of them will capture true_val

num_good_intervals = ... 
num_good_intervals

## Hypothesis testing with confidence intervals

Remember from a few weeks ago, we had a hunch that MT2 was significantly more difficult than MT1 in this dataset, and a hypothesis test gave us evidence supporting it.  

If it was more difficult, this would influence any further analysis we do with the data -- if the means were significantly different, we should not do a simple comparison between exam 1 and exam 2, since our analysis will reflect the difficulty of the exams more so than an individual's improvement.

So: let's figure out if there was a significant difference! Let's use a different approach - we can use hypothesis testing with a confidence interval. 

In [None]:
# Imagine we have a sample of 60 students, or 2 lab sections.
# Seed first so everyone draws the same 60 rows.
np.random.seed(12345)

# Reload the full score table and draw 60 rows without replacement
exams = Table.read_table("scores.csv").sample(60, with_replacement=False)
exams.show(5)

In [None]:
# Let's compare the distributions of the MT1 and MT2 scores in our sample.
exams.hist(["MT1 %", "MT2 %"])

**In this cell, answer the following:**

What is our test statistic?


What is our null hypothesis?


What is our alternative hypothesis?



In [None]:
# So, was there a difference between the two exams, given (MT2 - MT1)?
# Q1: Calculate the observed statistic from the dataset. 
# Hint -- we did most of this for you already (look at exams)
# NOTE(review): the next cell selects a "difference" column from exams, so
# that column presumably already holds MT2 - MT1 for each student -- confirm.

obs_diff = ...
obs_diff

In [None]:
# Let's go bootstrap! Let's do 1000 repetitions
# We reuse bootstrap_stats on a 1-column table holding the "difference"
# column of exams.
exam_diffs = exams.select("difference")

resamp_diff_means = bootstrap_stats(exam_diffs, 1000)
resamp_diff_means

In [None]:
# Q3: Calculate a 90% interval for this data. 
# (the bootstrapped means of the differences are in resamp_diff_means)
# Bonus: what is our p-value cutoff/level of significance?

lower = ...
upper = ...

[lower, upper]

In [None]:
# Q4: Finally, graph the distribution of resampled means (resamp_diff_means).
# use plt (matplotlib.pyplot) to show the bounds of the interval
# (lower and upper from Q3).

...

In [None]:
# Q5: What is our conclusion?
# What does this mean for our data and our analysis?
# NOTE(review): reject_null is presumably meant to be set to True or False -- confirm.
reject_null = ...
