## In-class notebook: 2025-01-22

In this notebook, we will look at some common usages of classical statistical inference. We will first look at common ways we empirically estimate error bars, and then look at hypothesis testing and ways to compare distributions. 

This notebook is intended to support Chapter 4.5-4.9 of the textbook, and material is taken from the following scripts (from astroML):

* https://github.com/astroML/astroML-notebooks/blob/main/chapter4/astroml_chapter4_Confidence_estimates.ipynb
* https://github.com/astroML/astroML-notebooks/blob/main/chapter4/astroml_chapter4_Hypothesis_testing.ipynb
* https://github.com/astroML/astroML-notebooks/blob/main/chapter4/astroml_chapter4_Comparison_of_distributions.ipynb

In [None]:
import numpy as np
from matplotlib import pyplot as plt

from IPython.display import HTML

# Inject a small JavaScript helper into the notebook page.  toggleCell()
# fetches the currently selected cell and toggles the visibility of its
# '.inner_cell' element; the cellIndex argument is accepted but never used.
# NOTE(review): depends on a global `Jupyter` object -- presumably only
# available in the classic Notebook front end; confirm before relying on it.
HTML('''
<script>
function toggleCell(cellIndex) {
    var cell = Jupyter.notebook.get_selected_cell();
    var element = cell.element.find('.inner_cell');
    element.toggle();
}
</script>
''')

## Confidence estimation: Bootstrap

In [None]:
from scipy.stats import norm
from astroML.resample import bootstrap
from astroML.stats import sigmaG

# Problem size: points in the parent sample and bootstrap realisations.
m = 1000  # number of points
n = 10000  # number of bootstraps

# Draw the parent sample from a unit Gaussian; the seed fixes the draw.
np.random.seed(123)
data = norm.rvs(loc=0, scale=1, size=m)

# Bootstrap distributions of two width estimators of the same sample:
# the sample standard deviation (ddof=1) and the quartile-based sigmaG.
mu1_bootstrap = bootstrap(data, n, np.std, kwargs={"axis": 1, "ddof": 1})
mu2_bootstrap = bootstrap(data, n, sigmaG, kwargs={"axis": 1})

In [None]:
# Gaussian approximations, centred on 1, to the sampling distributions of
# the two width estimators for a sample of m points.
x = np.linspace(0.8, 1.2, 1000)

# standard deviation: expected scatter 1 / sqrt(2 (m - 1))
sigma1 = 1.0 / np.sqrt(2 * (m - 1))
pdf1 = norm.pdf(x, loc=1, scale=sigma1)

# sigmaG: expected scatter 1.06 / sqrt(m)
sigma2 = 1.06 / np.sqrt(m)
pdf2 = norm.pdf(x, loc=1, scale=sigma2)

In [None]:
# Compare the bootstrap distributions of the two width estimators with the
# Gaussian curves pdf1 / pdf2 (drawn in gray).
fig, ax = plt.subplots(figsize=(8, 6))

# bootstrap distribution of the sample standard deviation (dashed blue steps)
ax.hist(mu1_bootstrap, bins=50, density=True, histtype='step',
        color='blue', ls='dashed', label=r'$\sigma\ {\rm (std. dev.)}$')
ax.plot(x, pdf1, color='gray')

# bootstrap distribution of sigmaG (solid red steps)
ax.hist(mu2_bootstrap, bins=50, density=True, histtype='step',
        color='red', label=r'$\sigma_G\ {\rm (quartile)}$')
ax.plot(x, pdf2, color='gray')

ax.set_xlim(0.82, 1.18)

ax.set_xlabel(r'$\sigma$', fontsize = 16)
ax.set_ylabel(r'$p(\sigma|x,I)$', fontsize = 16)

# only the two histograms carry labels, so the gray curves are not listed
ax.legend()

<div class="alert alert-info" style="color:black">
<h3>Exercise</h3>
<hr>
If you want to code this up from scratch, how would you do it?
</div>



In [None]:
# your answer here





In [None]:
def bootstrap_std(data, n, ddof=0):
    """Return ``n`` bootstrap estimates of the standard deviation of ``data``.

    Each estimate is the standard deviation of one resample of ``len(data)``
    values drawn from ``data`` with replacement.  Draws use numpy's global
    random state, so ``np.random.seed`` makes the result reproducible.

    Parameters
    ----------
    data : 1-D array-like
        Sample to resample.
    n : int
        Number of bootstrap resamples to draw.
    ddof : int, optional
        Delta degrees of freedom forwarded to ``np.std``.  The default of 0
        reproduces the original behaviour; pass ``ddof=1`` for the unbiased
        variance normalisation, which is what the astroML ``bootstrap`` call
        in this notebook uses (``kwargs=dict(axis=1, ddof=1)``).

    Returns
    -------
    list of float
        The ``n`` bootstrap standard deviations, in the order drawn.
    """
    N = len(data)
    return [
        np.std(np.random.choice(data, size=N, replace=True), ddof=ddof)
        for _ in range(n)
    ]

# y = bootstrap_std(data, n)
# y = np.array(y)
# plt.hist(y, bins=50)
# plt.xlim(0.82, 1.18)

# print(np.mean(y))
# print(np.mean(mu1_bootstrap))

# Render a button whose click handler calls the toggleCell() JavaScript
# function (no argument is passed to it).
HTML('''
<button onclick="toggleCell()">Show me the answer</button>
''')

## Confidence estimation: Jackknife

In [None]:
from astroML.resample import jackknife
from astroML.stats import sigmaG

# Re-create the seeded unit-Gaussian sample of m points.
np.random.seed(123)
m = 1000
data = norm.rvs(loc=0, scale=1, size=m)

# --- Jackknife of the sample standard deviation (ddof=1) ---
# return_raw_distribution=True additionally hands back the raw jackknife
# values, which are histogrammed in the next cell.
mu_s, sigma_mu_s, mu_s_raw = jackknife(
    data, np.std,
    kwargs={"axis": 1, "ddof": 1},
    return_raw_distribution=True,
)

pdf1_theory = norm(loc=1, scale=1.0 / np.sqrt(2 * (m - 1)))
pdf1_jackknife = norm(loc=mu_s, scale=sigma_mu_s)

# --- Jackknife of the quartile-based width sigmaG ---
mu_sigG, sigma_mu_sigG, mu_sigG_raw = jackknife(
    data, sigmaG,
    kwargs={"axis": 1},
    return_raw_distribution=True,
)
pdf2_theory = norm(loc=data.std(), scale=1.06 / np.sqrt(m))
pdf2_jackknife = norm(loc=mu_sigG, scale=sigma_mu_sigG)

# Report both jackknife estimates and their uncertainties.
print(f"mu_s = {mu_s:.3}, sigma_mu_s = {sigma_mu_s:.3}")
print(f"mu_sigmaG = {mu_sigG:.3}, sigma_mu_sigmaG = {sigma_mu_sigG:.3}")

In [None]:
# Two-panel figure: raw jackknife values (left) and the Gaussian error
# distributions implied by the jackknife vs. the theoretical ones (right).
fig = plt.figure(figsize=(8, 5))
fig.subplots_adjust(left=0.11, right=0.95, bottom=0.2, top=0.9,
                    wspace=0.25)

# Left panel: histograms (raw counts, shared bin edges) of the raw jackknife
# values for the standard deviation and for sigmaG.
ax = fig.add_subplot(121)
ax.hist(mu_s_raw, np.linspace(0.996, 1.008, 100),
        label=r'$\sigma^*\ {\rm (std.\ dev.)}$',
        histtype='stepfilled', fc='white', ec='black', density=False)
ax.hist(mu_sigG_raw, np.linspace(0.996, 1.008, 100),
        label=r'$\sigma_G^*\ {\rm (quartile)}$',
        histtype='stepfilled', fc='gray', density=False)
ax.legend(loc='upper left', handlelength=2, fontsize = 14)

ax.xaxis.set_major_locator(plt.MultipleLocator(0.004))
ax.set_xlabel(r'$\sigma^*$', fontsize = 14)
ax.set_ylabel(r'$N(\sigma^*)$', fontsize = 14)
ax.set_xlim(0.998, 1.008)
ax.set_ylim(0, 550)

# Right panel: jackknife-derived Gaussians (coloured, zorder=2) drawn on top
# of the theoretical Gaussians (gray, zorder=1).
ax = fig.add_subplot(122)
x = np.linspace(0.45, 1.15, 1000)
ax.plot(x, pdf1_jackknife.pdf(x),
        color='blue', ls='dashed', label=r'$\sigma\ {\rm (std.\ dev.)}$',
        zorder=2)
ax.plot(x, pdf1_theory.pdf(x), color='gray', zorder=1)

ax.plot(x, pdf2_jackknife.pdf(x),
        color='red', label=r'$\sigma_G\ {\rm (quartile)}$', zorder=2)
ax.plot(x, pdf2_theory.pdf(x), color='gray', zorder=1)
# plt.legend acts on the current axes, i.e. the right-hand panel created above
plt.legend(loc='upper left', handlelength=2, fontsize = 14)

ax.set_xlabel(r'$\sigma$', fontsize = 14)
ax.set_ylabel(r'$p(\sigma|x,I)$', fontsize = 14)
ax.set_xlim(0.45, 1.15)
ax.set_ylim(0, 24)

This failure is a general problem with the standard jackknife method, which performs well for smooth differential statistics such as the mean and standard deviation, but does not perform well for medians, quantiles, and other rank-based statistics. For these sorts of statistics, a jackknife implementation that removes more than one observation can overcome this problem. The reason for this failure becomes apparent upon examination of the figure above: for $\sigma_G$, the vast majority of jackknife samples yield one of three discrete values! Because quartiles are insensitive to the removal of outliers, all samples created by the removal of a point larger than $q_{75}$ lead to precisely the same estimate. The same is true for removal of any point smaller than $q_{25}$, and for any point in the range $q_{25} < x < q_{75}$. Because of this, the jackknife cannot accurately sample the error distribution, which leads to a gross misestimate of the result.

In [None]:
# convenient functions in astropy

from astropy.stats import jackknife_stats

# Draw 1000 standard-normal values (no seed is set in this cell) and
# jackknife their standard deviation.
x = np.random.normal(loc=0, scale=1, size=1000)
# The four return values are unpacked as estimate, bias, standard error and
# confidence interval; only the estimate and its standard error are printed.
estimate, bias, stderr, conf_interval = jackknife_stats(x, np.std)
print(estimate , stderr)

## Hypothesis testing

### Rejecting a null hypothesis

We flip a coin eight times and get six tails; should we reject the hypothesis that the coin is fair? We will assume the null hypothesis that the coin is indeed fair. Recall that we can find probabilities of coin flips with the binomial distribution,

$$ p(k|b,N) = \frac{N!}{k!(N-k)!} b^k (1-b)^{N-k}. $$

Since p-values are defined as the probability that something *at least* as extreme as your data could have occurred (assuming the null hypothesis is correct), we can find the p-value by adding the probability of 6/8, 7/8, and 8/8 tails.

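$$ \frac{8!}{6!\,2!}\left(\frac{1}{2}\right)^6 \left(\frac{1}{2}\right)^2 + \frac{8!}{7!\,1!}\left(\frac{1}{2}\right)^7 \left(\frac{1}{2}\right)^1 + \frac{8!}{8!\,0!}\left(\frac{1}{2}\right)^8 \left(\frac{1}{2}\right)^0 = \frac{37}{256} \approx 0.145 $$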

We get that the probability of this occurring is 0.145; thus, we cannot reject the null hypothesis at the 0.05 significance level.


### Hypothesis testing and classification

This is the example in class. 

Assume that $h_B(x) = \mathcal{N} (\mu = 100, \sigma = 10) $ and $h_S(x) = \mathcal{N} (\mu = 150, \sigma = 12)$, with $a$ = 0.1 and $N = 10^6$ (this would be an image with 1000 x 1000 resolution elements; the $x$ values correspond to the sum of background and source counts). We will plot these two distributions below.

In [None]:
from scipy.stats import norm

# Generate and draw the curves
# p1: Gaussian (mean 100, width 10) weighted by 0.9
# p2: Gaussian (mean 150, width 12) weighted by 0.1
x = np.linspace(50, 200, 1000)
p1 = 0.9 * norm(100, 10).pdf(x)
p2 = 0.1 * norm(150, 12).pdf(x)

# plot the distributions
fig, ax = plt.subplots(figsize=(14, 8))
ax.fill(x, p1, ec='k', fc='#AAAAAA', alpha=0.5)
ax.fill(x, p2, '-k', fc='#AAAAAA', alpha=0.5)

# plot x_c = 120
ax.plot([120, 120], [0.0, 0.04], '--k')

# label the two curves, the cut, and the region classified as sources
ax.text(100, 0.036, r'$h_B(x)$', ha='center', va='bottom', fontsize = 14)
ax.text(150, 0.0035, r'$h_S(x)$', ha='center', va='bottom', fontsize = 14)
ax.text(122, 0.039, r'$x_c=120$', ha='left', va='top', fontsize = 14)
ax.text(125, 0.01, r'$(x > x_c\ {\rm classified\ as\ sources})$', fontsize = 14)

ax.set_xlim(50, 200)
ax.set_ylim(0, 0.04)

ax.set_xlabel('$x$', fontsize = 14)
ax.set_ylabel('$p(x)$', fontsize = 14)
plt.show()

If we naively choose $x_c$=120 (a "$2\sigma$ cut" away from the mean for $h_B$, corresponding to a Type I error probability of $\alpha$ = 0.024 ((1-95.45\%)/2)), **21,600 values will be incorrectly classified as a source!** The sample completeness for this value of $x_c$ is 0.994 and **99,400 values are correctly classified as a source.** Although the Type I error rate is only 0.024, the sample contamination is 21,600/(21,600+99,400) = 0.179, or over 7 times higher!

## Comparison of distributions: KS-tests


In [None]:
from scipy import stats

# Illustration of the K-S statistic D using two seeded random samples.
np.random.seed(4)
plt.figure(figsize=(8, 6))

# Sorted sample values plotted against evenly spaced levels in [0, 1]:
# a 25-point sample (step curve) and a 1000-point sample (line), both drawn
# with stats.norm.rvs(0, 3, size).
plt.step(np.sort(stats.norm.rvs(0,3,25)), np.linspace(0, 1, 25) ,lw = 3)
plt.plot(np.sort(stats.norm.rvs(0,3,1000)), np.linspace(0, 1, 1000), lw=3)

# double-headed vertical arrow at x = 2.3, labelled "D" by the text below
plt.annotate("", xy=(2.3, 0.965), xytext=(2.3, 0.77),
            arrowprops=dict(arrowstyle="<->",lw=2))

plt.text(2.6,0.86, "D", fontsize = 20)

plt.legend(['CDF 1', 'CDF 2'])
plt.title('Comparing CDFs for K-S test')
plt.show()

In [None]:
# One-sample K-S tests: is a seeded standard-normal sample of 1000 points
# compatible with scipy's "norm" and "uniform" distributions?
np.random.seed(0)
vals = np.random.normal(0, 1, 1000)

normal_result = stats.kstest(vals, "norm")
uniform_result = stats.kstest(vals, "uniform")

print(f'Normal: {normal_result}')
print(f'Uniform: {uniform_result}')


In [None]:
# Two-sample K-S tests: were two samples drawn from the same parent
# distribution?  The three samples are drawn in a fixed order after seeding.
np.random.seed(0)
sample1 = np.random.uniform(0.0, 1.0, 100)   # uniform on [0, 1)
sample2 = np.random.normal(0.0, 1.0, 110)    # standard normal
sample3 = np.random.normal(0.0, 1.0, 95)     # standard normal, other size

uniform_vs_normal = stats.ks_2samp(sample1, sample2)
normal_vs_normal = stats.ks_2samp(sample2, sample3)

print(f'Uniform vs. Normal: {uniform_vs_normal}')
print(f'Normal vs. Normal: {normal_vs_normal}')

## Test of Gaussianity

**Anderson-Darling test**

The test is based on the statistic 

$$ A^2 = -N - \frac{1}{N}\sum^N_{i=1}[(2i-1)\ln(F_i)+(2N-2i+1)\ln(1-F_i)] $$

where $F_i$ is the $i$th value of the cumulative distribution function of $z_i$, which is defined as 

$$z_i = \frac{x_i-\mu}{\sigma}$$

and assumed to be in ascending order. In this expression, either one or both of $\mu$ and $\sigma$ can be known or determined from data $\{x_i\}$. Assuming Gaussianity, one can consult a lookup table for $A^2$ to determine whether the hypothesis of Gaussianity is rejected.

**Shapiro-Wilk test**

Based on both data values $x_i$ and data ranks $R_i$:

$$W = \frac{\big(\sum^N_{i=1}a_iR_i\big)^2}{\sum^N_{i=1}(x_i-\overline{x})^2} $$

where constants $a_i$ encode the expected values of the order statistics for random variables sampled from the standard normal distribution (the test's null hypothesis) -- draw N samples from a standard normal distribution and order them. The Shapiro-Wilk test is very sensitive to non-Gaussian tails of the distribution ("outliers") but not as much to detailed departures from Gaussianity in the distribution's core.

In [None]:
from astroML.stats import mean_sigma, median_sigmaG

# Build the two samples to be tested for Gaussianity (seeded draws).
np.random.seed(1)
n_total, n_wide = 10000, 4000

# single Gaussian, N(0, 1)
normal_vals = stats.norm.rvs(loc=0, scale=1, size=n_total)

# mixture of two Gaussians: start from N(0, 1) draws, then overwrite the
# first 4000 entries with draws from N(3, 2)
dual_vals = stats.norm.rvs(loc=0, scale=1, size=n_total)
dual_vals[:n_wide] = stats.norm.rvs(loc=3, scale=2, size=n_wide)

# matching pdfs evaluated on a common grid
x = np.linspace(-4, 10, 1000)
normal_pdf = stats.norm.pdf(x, loc=0, scale=1)
dual_pdf = (0.6 * stats.norm.pdf(x, loc=0, scale=1)
            + 0.4 * stats.norm.pdf(x, loc=3, scale=2))

# per-panel inputs for the plotting loop: samples, pdfs and x-ranges
vals = [normal_vals, dual_vals]
pdf = [normal_pdf, dual_pdf]
xlims = [(-4, 4), (-4, 10)]

In [None]:
# Run three tests of Gaussianity on each sample, print the results, and
# annotate a histogram of the sample (true pdf overplotted) with them.
fig = plt.figure(figsize=(8, 6))

for i in range(2):
    ax = fig.add_subplot(2, 1, i + 1)

    # compute statistics

    # Anderson-Darling test.  The result fields are read by name because
    # scipy orders them (statistic, critical_values, significance_level):
    # the critical values come *before* the significance levels.
    ad_result = stats.anderson(vals[i])
    A2 = ad_result.statistic
    crit = ad_result.critical_values
    sig = ad_result.significance_level  # given in percent

    # KS test against scipy's default "norm" distribution
    D, pD = stats.kstest(vals[i], "norm")

    # Shapiro test
    W, pW = stats.shapiro(vals[i])

    # Location and width summaries of the sample.  The second width is bound
    # to `sigma_G` so that it does not overwrite the `sigmaG` function
    # imported from astroML.stats elsewhere in this notebook.
    mu, sigma = mean_sigma(vals[i], ddof=1)
    median, sigma_G = median_sigmaG(vals[i])

    # display results in a table
    print(70 * '_')
    print("  Kolmogorov-Smirnov test: D = %.2g  p = %.2g " % (D, pD))
    print("  Anderson-Darling test: A^2 = %.2g" % A2)
    print("    significance  | critical value ")
    print("    --------------|----------------")
    # each row pairs a significance level (%) with its critical value, in
    # the same left-to-right order as the header
    for level, value in zip(sig, crit):
        print("    {0:<14}| {1:.3f}".format(f"{level:.1f}%", value))
    print("  Shapiro-Wilk test: W = %.2g p = %.2g" % (W, pW))

    # plot a histogram
    ax.hist(vals[i], bins=50, density=True, histtype='stepfilled', alpha=0.5)
    ax.plot(x, pdf[i], '-k')
    ax.set_xlim(xlims[i])

    # print information on the plot
    info = "Anderson-Darling: $A^2 = %.2f$\n" % A2
    info += "Kolmogorov-Smirnov: $D = %.2g$\n" % D
    info += "Shapiro-Wilk: $W = %.2g$\n" % W
    ax.text(0.97, 0.97, info, ha='right', va='top',
            transform=ax.transAxes, fontsize=12)

    # per-panel y-range; only the bottom panel gets an x-axis label
    if i == 0:
        ax.set_ylim(0, 0.55)
    else:
        ax.set_ylim(0, 0.35)
        ax.set_xlabel('$x$', fontsize=14)

    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)
    ax.set_ylabel('$p(x)$', fontsize=14)