In [34]:
# Confidence Intervals and Bootstrapping

# To use this tutorial, read the commands and execute the code section by section.

# The learning objective is to understand how to compute confidence 
# intervals (known as credible intervals when using Bayesian approaches),
# which are a fundamental piece of rigorous, quantitative science beacuse
# they describe the uncertainty that you have in estimating some property
# of the world given noisy, finite observations.

# For NGG students, this tutorial is meant to be used in tandem with this
# Discussion on the NGG Canvas site:

#  https://canvas.upenn.edu/courses/1358934/discussion_topics/5464266

# Copyright 2019 by Joshua I. Gold, University of Pennsylvania
# Originally written for Matlab, translated to Python 1.30.20 by CMH

import numpy as np
from scipy.stats import norm
from scipy import stats
from random import choices

# Exercise: Compute confidence/credible intervals for simulated data 
# sampled from a population that is Gaussian distributed with mean mu=10 
# and standard deviation sigma=2, for n=5, 10, 20, 40, 80 at a 
# 95% confidence level.
mu = 10
sigma = 5
alpha = 0.95
NB = 1000 # number of bootstraps

# Loop through the n's
# Note that the different approaches converge on the same answer as n gets large
for n in [5, 10, 20, 40, 80, 160, 1000]: 
   
    #  Simulate some data
    data = np.random.normal(mu, sigma, n)
   
    # Save the mean
    meand = np.mean(data)
   
    # Show the mean, n
    print('N = {0:.2f}, MEAN = {1:.2f}'.format(n, meand))
   
    # METHOD 1: analytic solution assuming Gaussian

    # Get the z-score for the given confidence level (make it negative
    # so we can subtract it to make the lower interval)
    z = norm.ppf((1-alpha)/2)
   
    # 1a. Use the given sigma
    sem = sigma/np.sqrt(n);
    CI_low = meand - (sem*z)
    CI_high = meand + (sem*z)
    print('\t1a: CI = [{0:.2f} {1:.2f}]'.format(CI_low, CI_high))

    # 1b. Use the sample sigma
    # BEST IF n IS LARGE (>30)
    sem = np.std(data)/np.sqrt(n); 
    CI_low = meand - (sem*z)
    CI_high = meand + (sem*z)
    print('\t1b: CI = [{0:.2f} {1:.2f}]'.format(CI_low, CI_high))
   
    # METHOD 2: analytic solution assuming t-distribution
    # BEST IF n IS SMALL (<30) ... note that as n increases, the t
    # distribution approaches a Gaussian and methods 1 and 2 become more
    # and more similar

    # Get the cutoff using the t distribution, which is said to have n-1
    # degrees of freedom
    t = stats.t.ppf((1-alpha)/2, n - 1)  
    sem = np.std(data)/np.sqrt(n);
    CI_low = meand - (sem*t)
    CI_high = meand + (sem*t)
    print('\t2:  CI = [{0:.1f} {1:.2f}]'.format(CI_low, CI_high))
   
    # METHOD 3: bootstrap!

    # Resample the data with replacement to get new estimates of mu 
    # Note that here we do not make any assumptions about the nature of the real distribution.
    for ii in range(NB - 1):
        random_sample = np.random.choice(data, n)
        mu_star[ii] = np.mean(random_sample);
   
    # Now report the CI directly from the bootstrapped distribution
    CI_low =  np.percentile(mu_star, 100*(1-alpha)/2)
    CI_high = np.percentile(mu_star, 100*(alpha+(1-alpha)/2))
    print('\t3:  CI = [{0:.2f} {1:.2f}]'.format(CI_low, CI_high))
   
    # Method 4: Credible interval
    # See the Canvas discussion -- under these assumptions (i.e., data
    # generated from a Gaussian distribution with known sigma), the answer
    # is exactly the same as with Method 1, above. Note that this
    # equivalence is NOT true in general, which means that frequentist
    # confidence intervals and Bayesian credible intervals can give
    # different answers for certain distributions.

N = 5.00, MEAN = 5.02
	1a: CI = [9.40 0.63]
	1b: CI = [9.63 0.41]
	2:  CI = [11.5 -1.51]
	3:  CI = [0.46 9.26]
N = 10.00, MEAN = 8.63
	1a: CI = [11.73 5.53]
	1b: CI = [11.35 5.91]
	2:  CI = [11.8 5.49]
	3:  CI = [6.26 11.45]
N = 20.00, MEAN = 10.96
	1a: CI = [13.15 8.77]
	1b: CI = [12.80 9.11]
	2:  CI = [12.9 8.99]
	3:  CI = [9.02 12.79]
N = 40.00, MEAN = 10.30
	1a: CI = [11.85 8.75]
	1b: CI = [11.79 8.82]
	2:  CI = [11.8 8.77]
	3:  CI = [8.79 11.75]
N = 80.00, MEAN = 9.02
	1a: CI = [10.12 7.92]
	1b: CI = [10.19 7.85]
	2:  CI = [10.2 7.83]
	3:  CI = [7.85 10.38]
N = 160.00, MEAN = 10.48
	1a: CI = [11.26 9.71]
	1b: CI = [11.15 9.82]
	2:  CI = [11.2 9.81]
	3:  CI = [9.87 11.15]
N = 1000.00, MEAN = 10.00
	1a: CI = [10.31 9.69]
	1b: CI = [10.31 9.69]
	2:  CI = [10.3 9.69]
	3:  CI = [9.68 10.32]
