# In-class notebook: 2025-01-13

In this notebook, we will get familiar with a number of common distributions that you might encounter in your research. We will then look at some illustrations of the central limit theorem. Finally, we will see how to generate random numbers from an arbitrary distribution.

This notebook is intended to support Chapter 3.3-3.7 of the textbook, and material is taken from the following scripts (from astroML):

* https://github.com/astroML/astroML-notebooks/blob/main/chapter3/astroml_chapter3_Univariate_Distribution_Functions.ipynb
* https://github.com/astroML/astroML-notebooks/blob/main/chapter3/astroml_chapter3_The_Central_Limit_Theorem.ipynb
* https://github.com/astroML/astroML_figures/blob/main/book_figures/chapter3/fig_clone_distribution.py

## First let's just plot all the distributions we learned in class

### Uniform

In [None]:
from scipy.stats import uniform
import numpy as np
# pyplot is matplotlib's supported plotting interface; the pylab namespace is
# a legacy layer whose use matplotlib discourages.  Every plt.* name used in
# this notebook is provided by pyplot.
import matplotlib.pyplot as plt

# Widths of the uniform distributions to compare, one line style per width.
W_values = [1.0, 2.0, 3.0]
linestyles = ['-', '--', ':']
mu = 0  # common centre of every distribution
x = np.linspace(-2, 2, 1000)  # grid on which the pdfs are evaluated

In [None]:
# Draw the uniform pdf for each width W, all centred on mu.
fig, ax = plt.subplots(figsize=(6, 4.5))

for W, ls in zip(W_values, linestyles):
    # scipy parametrises the uniform by (loc, scale) = (left edge, width)
    dist = uniform(loc=mu - 0.5 * W, scale=W)
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$\mu=%i,\ W=%i$' % (mu, W))

ax.set_xlim(-1.7, 1.7)
ax.set_ylim(0, 1.2)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|\mu, W)$', fontsize=12)
ax.set_title('Uniform Distribution', fontsize=12)

ax.legend()

### Gaussian

In [None]:
from scipy.stats import norm

# Standard deviations to compare, one line style per value.
sigma_values = [0.5, 1.0, 2.0]
linestyles = ['-', '--', ':']
mu = 0  # mean shared by all curves
x = np.linspace(-10, 10, 1000)  # evaluation grid for the pdfs

In [None]:
# Gaussian pdfs sharing one mean but with different widths.
fig, ax = plt.subplots(figsize=(6, 4.5))

for sigma, ls in zip(sigma_values, linestyles):
    dist = norm(loc=mu, scale=sigma)  # frozen Gaussian with mean mu, width sigma
    # evaluate the pdf on the grid x and draw it
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$\mu=%i,\ \sigma=%.1f$' % (mu, sigma))

ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.85)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|\mu,\sigma)$', fontsize=12)
ax.set_title('Gaussian Distribution', fontsize=12)

ax.legend()

### Binomial

In [None]:
from scipy.stats import binom

# Curve i uses n_values[i] trials with success probability b_values[i].
n_values = [20, 20, 40]
b_values = [0.2, 0.6, 0.6]
linestyles = ['-', '--', ':']
# Integer grid for the pmf; it starts at -1, one step below the smallest
# possible outcome k = 0.
k = np.arange(-1, 200)

In [None]:
# Binomial pmf for each (n, b) pair, drawn as a step curve.
fig, ax = plt.subplots(figsize=(6, 4.5))

for n, b, ls in zip(n_values, b_values, linestyles):
    dist = binom(n=n, p=b)  # n trials, success probability b
    # The pmf (probability mass function) is evaluated on the integer grid k;
    # 'steps-mid' makes the discrete values look like a histogram.
    ax.plot(k, dist.pmf(k), color='black', linestyle=ls,
            drawstyle='steps-mid',
            label=r'$b=%.1f,\ n=%i$' % (b, n))

ax.set_xlim(-0.5, 35)
ax.set_ylim(0, 0.25)

ax.set_xlabel('$k$', fontsize=12)
ax.set_ylabel(r'$p(k|b, n)$', fontsize=12)
ax.set_title('Binomial Distribution', fontsize=12)

ax.legend()

### Poisson

In [None]:
from scipy.stats import poisson

# Integer grid on which the pmf is evaluated (starts one step below k = 0).
k = np.arange(-1, 200)
# Poisson rates to compare, one line style per rate.
mu_values = [1, 5, 15]
linestyles = ['-', '--', ':']

In [None]:
# Poisson pmf for each rate mu, drawn as a step curve.
fig, ax = plt.subplots(figsize=(6, 4.5))

for mu, ls in zip(mu_values, linestyles):
    dist = poisson(mu=mu)
    ax.plot(k, dist.pmf(k), color='black', linestyle=ls,
            drawstyle='steps-mid',
            label=r'$\mu=%i$' % mu)

ax.set_xlim(-0.5, 30)
ax.set_ylim(0, 0.4)

ax.set_xlabel('$k$', fontsize=12)
ax.set_ylabel(r'$p(k|\mu)$', fontsize=12)
ax.set_title('Poisson Distribution', fontsize=12)

ax.legend()

### Cauchy (Lorentzian)

In [None]:
from scipy.stats import cauchy

# Scale parameters (gamma) to compare, one line style per value.
gamma_values = [0.5, 1.0, 2.0]
linestyles = ['-', '--', ':']
mu = 0  # location shared by all curves
x = np.linspace(-10, 10, 1000)  # evaluation grid for the pdfs

In [None]:
# Cauchy (Lorentzian) pdfs with a common location and different scales.
fig, ax = plt.subplots(figsize=(6, 4.5))

for gamma, ls in zip(gamma_values, linestyles):
    dist = cauchy(loc=mu, scale=gamma)
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$\mu=%i,\ \gamma=%.1f$' % (mu, gamma))

ax.set_xlim(-4.5, 4.5)
ax.set_ylim(0, 0.65)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|\mu,\gamma)$', fontsize=12)
ax.set_title('Cauchy Distribution', fontsize=12)

ax.legend()

### Laplace (exponential)

In [None]:
from scipy.stats import laplace

# Scale parameters (Delta) to compare, one line style per value.
delta_values = [0.5, 1.0, 2.0]
linestyles = ['-', '--', ':']
mu = 0  # location shared by all curves
x = np.linspace(-10, 10, 1000)  # evaluation grid for the pdfs

In [None]:
# Laplace (two-sided exponential) pdfs with different scales.
fig, ax = plt.subplots(figsize=(6, 4.5))

for delta, ls in zip(delta_values, linestyles):
    dist = laplace(loc=mu, scale=delta)
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$\mu=%i,\ \Delta=%.1f$' % (mu, delta))

ax.set_xlim(-6, 6)
ax.set_ylim(0, 1.0)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|\mu,\Delta)$', fontsize=12)
ax.set_title('Laplace Distribution', fontsize=12)

ax.legend()

In [None]:
from scipy.stats import expon

# Scale parameters (Delta) to compare, one line style per value.
delta_values = [0.5, 1.0, 2.0]
linestyles = ['-', '--', ':']
mu = 0  # location (left edge) shared by all curves
# Non-negative grid only; the [1:] slice drops the first point, x = 0.
x = np.linspace(0, 10, 1000)[1:]

In [None]:
# Exponential pdfs p(x) = exp(-(x - mu) / Delta) / Delta for x >= mu.
fig, ax = plt.subplots(figsize=(6, 4.5))

for delta, ls in zip(delta_values, linestyles):
    # Use the one-sided exponential here.  The Laplace pdf is
    # exp(-|x - mu| / Delta) / (2 Delta), so plotting laplace on x > 0 would
    # show only half of the exponential density under this title.
    dist = expon(loc=mu, scale=delta)

    plt.plot(x, dist.pdf(x), ls=ls, c='black',
             label=r'$\mu=%i,\ \Delta=%.1f$' % (mu, delta))

plt.xlim(0, 6)
plt.ylim(0, 1.0)

plt.xlabel('$x$', fontsize = 12)
plt.ylabel(r'$p(x|\mu,\Delta)$', fontsize = 12)
plt.title('Exponential Distribution', fontsize = 12)

plt.legend()

### $\chi^2$

In [None]:
from scipy.stats import chi2

# Degrees of freedom to compare, one line style per value.
k_values = [1, 2, 5, 7]
linestyles = ['-', '--', ':', '-.']
mu = 0  # passed as the location argument when the distribution is built
x = np.linspace(-1, 20, 1000)  # evaluation grid for the pdfs

In [None]:
# Chi-squared pdfs for several degrees of freedom k.
fig, ax = plt.subplots(figsize=(6, 4.5))
fig.subplots_adjust(bottom=0.12)

for k, ls in zip(k_values, linestyles):
    dist = chi2(df=k, loc=mu)  # k degrees of freedom, shifted by mu
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$k=%i$' % k)

ax.set_xlim(0, 10)
ax.set_ylim(0, 0.5)

ax.set_xlabel('$Q$', fontsize=12)
ax.set_ylabel(r'$p(Q|k)$', fontsize=12)
ax.set_title(r'$\chi^2\ \mathrm{Distribution}$', fontsize=12)

ax.legend()

### Student’s t

In [None]:
from scipy.stats import t as student_t

mu = 0
# Degrees of freedom to compare; 1E10 stands in for the k -> infinity limit
# (the plotting cell labels any k >= 1E10 as infinity).
k_values = [1E10, 2, 1, 0.5]

linestyles = ['-', '--', ':', '-.']
x = np.linspace(-10, 10, 1000)  # evaluation grid for the pdfs

In [None]:
# Student's t pdfs for several degrees of freedom k.
fig, ax = plt.subplots(figsize=(6, 4.5))

for k, ls in zip(k_values, linestyles):
    dist = student_t(df=k, loc=0)

    # a huge k is used as a stand-in for infinity and labelled as such
    label = (r'$\mathrm{t}(k=\infty)$' if k >= 1E10
             else r'$\mathrm{t}(k=%.1f)$' % k)

    ax.plot(x, dist.pdf(x), linestyle=ls, color='black', label=label)

ax.set_xlim(-5, 5)
ax.set_ylim(0.0, 0.45)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|k)$', fontsize=12)
ax.set_title("Student's $t$ Distribution", fontsize=12)

ax.legend()

### Fisher’s F

In [None]:
from scipy.stats import f as fisher_f

mu = 0  # passed as the location argument when the distribution is built
# Curve i uses the pair (d1_values[i], d2_values[i]) of degrees of freedom.
d1_values = [1, 5, 2, 10]
d2_values = [1, 2, 5, 50]
linestyles = ['-', '--', ':', '-.']
# Positive grid only; the [1:] slice drops the first point, x = 0.
x = np.linspace(0, 5, 1001)[1:]

In [None]:
# Fisher F pdfs for several (d1, d2) degree-of-freedom pairs.
fig, ax = plt.subplots(figsize=(6, 4.5))

for d1, d2, ls in zip(d1_values, d2_values, linestyles):
    dist = fisher_f(dfn=d1, dfd=d2, loc=mu)
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$d_1=%i,\ d_2=%i$' % (d1, d2))

ax.set_xlim(0, 4)
ax.set_ylim(0.0, 1.0)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|d_1, d_2)$', fontsize=12)
ax.set_title("Fisher's Distribution", fontsize=12)

ax.legend()

### Beta

In [None]:
from scipy.stats import beta

# Curve i uses the shape pair (alpha_values[i], beta_values[i]).
alpha_values = [0.5, 1.5, 3.0, 0.5]
beta_values = [0.5, 1.5, 3.0, 1.5]
linestyles = ['-', '--', ':', '-.']
# Open interval (0, 1): the [1:-1] slice drops both end points.
x = np.linspace(0, 1, 1002)[1:-1]

In [None]:
# Beta pdfs for several (alpha, beta) shape pairs.
fig, ax = plt.subplots(figsize=(6, 4.5))

for a, b, ls in zip(alpha_values, beta_values, linestyles):
    dist = beta(a=a, b=b)
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$\alpha=%.1f,\ \beta=%.1f$' % (a, b))

ax.set_xlim(0, 1)
ax.set_ylim(0, 3)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|\alpha,\beta)$', fontsize=12)
ax.set_title('Beta Distribution', fontsize=12)

ax.legend(loc=0)

### Gamma

In [None]:
from scipy.stats import gamma

# Curve i uses shape k_values[i] and scale theta_values[i].
k_values = [1, 2, 3, 5]
theta_values = [2, 1, 1, 0.5]
linestyles = ['-', '--', ':', '-.']
# The grid starts at 1E-6 rather than exactly 0.
x = np.linspace(1E-6, 10, 1000)

In [None]:
# Gamma pdfs for several (shape k, scale theta) pairs.
fig, ax = plt.subplots(figsize=(6, 4.5))

for k, t, ls in zip(k_values, theta_values, linestyles):
    # shape k, location fixed at zero, scale theta
    dist = gamma(a=k, loc=0, scale=t)
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$k=%.1f,\ \theta=%.1f$' % (k, t))

ax.set_xlim(0, 10)
ax.set_ylim(0, 0.45)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|k,\theta)$', fontsize=12)
ax.set_title('Gamma Distribution', fontsize=12)

ax.legend(loc=0)

### Weibull

In [None]:
from scipy.stats import weibull_min

# Curve i uses shape k_values[i] and scale lam_values[i].
k_values = [0.5, 1, 2, 2]
lam_values = [1, 1, 1, 2]
# NOTE: five styles for four parameter pairs; zip() in the plotting cell stops
# at the shortest input, so the last entry is never used.
linestyles = ['-', '--', ':', '-.', '--']
mu = 0  # passed as the location argument when the distribution is built
x = np.linspace(-10, 10, 1000)  # evaluation grid for the pdfs

In [None]:
# Weibull pdfs for several (shape k, scale lambda) pairs.
fig, ax = plt.subplots(figsize=(6, 4.5))

for k, lam, ls in zip(k_values, lam_values, linestyles):
    dist = weibull_min(c=k, loc=mu, scale=lam)
    ax.plot(x, dist.pdf(x), linestyle=ls, color='black',
            label=r'$k=%.1f,\ \lambda=%i$' % (k, lam))

ax.set_xlim(0, 5)
ax.set_ylim(0, 1)

ax.set_xlabel('$x$', fontsize=12)
ax.set_ylabel(r'$p(x|k,\lambda)$', fontsize=12)
ax.set_title('Weibull Distribution', fontsize=12)

ax.legend()

## Central Limit Theorem

In [None]:
# Generate the uniform samples
# N lists how many draws are averaged together in each panel of the plot.
N = [2, 3, 10]

np.random.seed(42)  # fixed seed so the figure is reproducible
# max(N) rows by 1e6 columns of uniform draws on [0, 1): averaging the first
# n rows gives 1e6 realisations of an n-sample mean.
x = np.random.random((max(N), int(1E6)))
print(x.shape)

In [None]:
# Compare the distribution of n-sample means of uniform draws with the
# Gaussian predicted by the central limit theorem, one panel per entry of N.
fig = plt.figure(figsize=(8, 8))
fig.subplots_adjust(hspace=0.05)

last_panel = len(N) - 1

for i, n_avg in enumerate(N):
    ax = fig.add_subplot(3, 1, i + 1)

    # average the first n_avg rows: one mean per column
    x_i = x[:n_avg].mean(axis=0)

    # normalised histogram of the means
    ax.hist(x_i, bins=np.linspace(0, 1, 101),
            histtype='stepfilled', alpha=0.5, density=True)

    # expected Gaussian: a single uniform draw has sigma = 1/sqrt(12), and the
    # mean of n_avg draws shrinks that by sqrt(n_avg)
    mu = 0.5
    sigma = 1. / np.sqrt(12 * n_avg)
    dist = norm(mu, sigma)
    x_pdf = np.linspace(-0.5, 1.5, 1000)
    ax.plot(x_pdf, dist.pdf(x_pdf), '-k')

    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.001, None)

    ax.xaxis.set_major_locator(plt.MultipleLocator(0.2))
    ax.yaxis.set_major_locator(plt.MaxNLocator(5))

    ax.text(0.99, 0.95, r"$N = %i$" % n_avg,
            ha='right', va='top', transform=ax.transAxes)

    # only the bottom panel carries x tick labels and the x axis label
    if i < last_panel:
        ax.xaxis.set_major_formatter(plt.NullFormatter())
    else:
        ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.4f'))
        ax.set_xlabel(r'$x$', fontsize=12)

    ax.set_ylabel('$p(x)$', fontsize=12)

In [None]:
# Generate the laplace samples
# N lists how many draws are averaged together in each panel of the plot.
N = [2, 10, 20]

np.random.seed(42)  # fixed seed so the figure is reproducible
# max(N) rows by 1e6 columns of Laplace draws with location 0.5 and
# scale 1/sqrt(12).
x = np.random.laplace(0.5,(1/np.sqrt(12)),(max(N), int(1E6)))

In [None]:
# Compare the distribution of n-sample means of Laplace draws with the
# Gaussian predicted by the central limit theorem, one panel per entry of N.
fig = plt.figure(figsize=(8, 8))
fig.subplots_adjust(hspace=0.05)

last_panel = len(N) - 1

for i, n_avg in enumerate(N):
    ax = fig.add_subplot(3, 1, i + 1)

    # average the first n_avg rows: one mean per column
    x_i = x[:n_avg].mean(axis=0)

    # normalised histogram of the means
    ax.hist(x_i, bins=np.linspace(0, 1, 101),
            histtype='stepfilled', alpha=0.5, density=True)

    # expected Gaussian: a Laplace draw with scale b has sigma = sqrt(2) * b,
    # here b = 1/sqrt(12); averaging n_avg draws divides sigma by sqrt(n_avg)
    mu = 0.5
    sigma = (1 / np.sqrt(12 * n_avg)) * np.sqrt(2)
    dist = norm(mu, sigma)
    x_pdf = np.linspace(-0.5, 1.5, 1000)
    ax.plot(x_pdf, dist.pdf(x_pdf), '-k')

    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.001, None)

    ax.xaxis.set_major_locator(plt.MultipleLocator(0.2))
    ax.yaxis.set_major_locator(plt.MaxNLocator(5))

    ax.text(0.99, 0.95, r"$N = %i$" % n_avg,
            ha='right', va='top', transform=ax.transAxes)

    # only the bottom panel carries x tick labels and the x axis label
    if i < last_panel:
        ax.xaxis.set_major_formatter(plt.NullFormatter())
    else:
        ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.4f'))
        ax.set_xlabel(r'$x$', fontsize=12)

    ax.set_ylabel('$p(x)$', fontsize=12)

## Next let's generate a random number from an arbitrary distribution (or, cloning a distribution)

In [None]:
from scipy import stats, interpolate
from astropy.visualization import hist
from astroML.density_estimation import EmpiricalDistribution

#------------------------------------------------------------
# Create a distribution and clone it
Ndata = 1000      # size of the 'observed' sample
Nclone = 100000   # number of values to draw when cloning
np.random.seed(0)

# generate an 'observed' bimodal distribution with Ndata (= 1000) values:
# 60% from a Gaussian centred at -1.3 and 40% from one centred at +1.3,
# both with sigma = 0.5
dists1 = stats.norm(-1.3, 0.5)
fracs1 = 0.6
dists2 = stats.norm(1.3, 0.5)
fracs2 = 0.4

# draw each component separately and concatenate (the result is not shuffled)
x = np.hstack((dists1.rvs(int(fracs1 * Ndata)), dists2.rvs(int(fracs2 * Ndata))))
plt.hist(x, bins=40)
plt.xlabel('x')

In [None]:
# We can clone the distribution easily with this function
# (presumably EmpiricalDistribution learns the distribution of x and .rvs
# draws Nclone new values from it; see the astroML documentation)
x_cloned = EmpiricalDistribution(x).rvs(Nclone)

plt.hist(x_cloned, bins=40)
plt.xlabel('x_cloned')

In [None]:
# compute the KS test to check if they're the same
# D is the two-sample Kolmogorov-Smirnov statistic and p its p-value for the
# input sample x versus the clone x_cloned.
D, p = stats.ks_2samp(x, x_cloned)
print("KS test: D = %.2g; p = %.2g" % (D, p))

In [None]:
# Now let's try to do the clone by hand

# Build the empirical cumulative distribution: after sorting, the i-th
# smallest value is paired with an evenly spaced probability between 0 and 1.
x.sort()  # in place: the following cells rely on x staying sorted
# Size the grid from the data itself rather than from Ndata, so the two
# arrays always have matching lengths even if the sample size differs from
# Ndata (e.g. through int() truncation of the mixture fractions).
Px_cuml = np.linspace(0, 1, len(x))

# this gives you the CDF
plt.figure(figsize=(15,3))
plt.plot(x, Px_cuml, marker='o', markersize=0.5, lw=0.2)
plt.ylabel('regular grid from 0 to 1')
plt.xlabel('sorted x')

In [None]:
# rotate and sample from the axis between 0 and 1
# Swapping the axes of the CDF plot shows the inverse CDF: cumulative
# probability on the horizontal axis, data value on the vertical axis.

plt.figure(figsize=(15,3))
plt.plot(Px_cuml, x, marker='o', markersize=0.2, lw=0.2)
plt.xlabel('regular grid from 0 to 1')
plt.ylabel('sorted x')

In [None]:
# set up an interpolation of the inverse cumulative distribution
# (spline representation of x as a function of cumulative probability)
tck = interpolate.splrep(Px_cuml, x)

# sample evenly along the cumulative distribution, and interpolate
# NOTE: this is a deterministic, evenly spaced grid of 10 * Ndata
# probabilities, not random draws.
Px_cuml_sample = np.linspace(0, 1, 10 * Ndata)
x_sample = interpolate.splev(Px_cuml_sample, tck)


In [None]:
# play with the bins option
# Overlay normalised histograms of the input sample (filled) and of the
# spline-based clone x_sample (outline); both use astropy's 'knuth' binning.
hist(x, bins='knuth', 
     histtype='stepfilled', density=True,
     ec='#AAAAAA', fc='#DDDDDD',
     label='input data')
hist(x_sample, bins='knuth', 
     histtype='step', density=True,
     color='k', label='cloned data')
plt.legend()
plt.xlabel('x')

In [None]:
# Four-panel summary of the cloning procedure:
#   top-left     input histogram
#   top-right    empirical cumulative distribution
#   bottom-left  inverse cumulative distribution with the spline fit
#   bottom-right input vs. cloned histograms
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(left=0.1, right=0.95, bottom=0.08, top=0.92,
                    hspace=0.3, wspace=0.3)

# 20 evenly spaced positions in the sorted sample, drawn as scatter markers
indices = np.linspace(0, Ndata - 1, 20).astype(int)
marker_style = dict(lw=0, c='k', s=9)
arrow_style = dict(fc='gray', ec='gray')

# --- input data histogram ---
ax = fig.add_subplot(2, 2, 1)
hist(x, bins='knuth', ax=ax,
     histtype='stepfilled', ec='k', fc='#AAAAAA')
ax.set_ylim(0, 300)
ax.set_title('Input data distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$N(x)$')

# --- cumulative distribution ---
ax = fig.add_subplot(2, 2, 2)
ax.scatter(x[indices], Px_cuml[indices], **marker_style)
ax.plot(x, Px_cuml, '-k')
ax.set_xlim(-3, 3)
ax.set_ylim(-0.05, 1.05)
ax.set_title('Cumulative Distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$p(<x)$')

# --- inverse cumulative distribution and its spline fit ---
ax = fig.add_subplot(2, 2, 3)
ax.scatter(Px_cuml[indices], x[indices], **marker_style)
ax.plot(Px_cuml_sample, x_sample, '-k')
# the arrows illustrate the lookup: go up from p = 0.7 to the curve, then
# across to the x axis value
ax.arrow(0.7, -3, 0, 3.5, width=0.015,
         head_width=0.05, head_length=0.4, **arrow_style)
ax.arrow(0.7, 0.9, -0.69, 0, width=0.1,
         head_width=0.3, head_length=0.06, **arrow_style)
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(-3, 3)
ax.set_title('Inverse Cuml. Distribution')
ax.set_xlabel('$p(<x)$')
ax.set_ylabel('$x$')

# --- resulting cloned distribution ---
ax = fig.add_subplot(2, 2, 4)
hist(x, bins='knuth', ax=ax,
     histtype='stepfilled', density=True,
     ec='#AAAAAA', fc='#DDDDDD',
     label='input data')
hist(x_sample, bins='knuth', ax=ax,
     histtype='step', density=True,
     color='k', label='cloned data')
ax.set_title('Cloned Distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)dx$')
# NOTE(review): D and p were computed earlier for x_cloned (the
# EmpiricalDistribution clone), while this panel plots x_sample (the spline
# clone) -- confirm which pair the annotation is meant to describe.
ax.text(0.74, 0.95, "KS test:\nD = %.2f\np = %.2f" % (D, p),
        ha='left', va='top', transform=ax.transAxes)