# In-class notebook: 2025-01-08

In this notebook, we will look at an example of marginal and conditional probability, see how random variables transform under certain operations, and review some commonly used descriptive statistics.

This notebook is intended to support Chapter 3.1-3.2 of the textbook; the material is taken from the following notebooks (from astroML):
* https://github.com/astroML/astroML-notebooks/blob/main/chapter3/astroml_chapter3_Overview_of_Probability_and_Random_Variables.ipynb
* https://github.com/astroML/astroML-notebooks/blob/main/chapter3/astroml_chapter3_Descriptive_Statistics.ipynb

## Marginal and conditional probability

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.ticker import NullFormatter

def banana_distribution(N=10000):
    """Draw N random (x, y) points that trace out a banana shape.

    Parameters
    ----------
    N : int
        Number of points to draw.

    Returns
    -------
    (x, y) : tuple of two arrays, each of length N.

    Notes
    -----
    Uses the global ``np.random`` state: one normal draw of size N for the
    angles, followed by one normal draw of size N for the radial scatter.
    """
    quarter_pi = np.pi / 4

    # angles: Gaussian draws, with every value at or beyond +/- pi/4 halved
    theta = np.random.normal(0, np.pi / 8, N)
    theta = np.where(np.abs(theta) >= quarter_pi, theta / 2, theta)

    # radius along the parametric curve, plus Gaussian scatter
    radius = np.sqrt(1. / np.abs(np.cos(theta) ** 2 - np.sin(theta) ** 2))
    radius = radius + np.random.normal(0, 0.08, size=N)

    # rotate by pi/4 and convert from polar to Cartesian coordinates
    angle = theta + quarter_pi
    return (radius * np.cos(angle), radius * np.sin(angle))

# Draw a reproducible banana-shaped sample and bin it on a regular 2D grid
np.random.seed(0)
x, y = banana_distribution(N=10000)

Ngrid = 41
grid = np.linspace(0, 2, num=Ngrid + 1)  # Ngrid + 1 edges -> Ngrid bins per axis

H, xbins, ybins = np.histogram2d(x, y, bins=(grid, grid))

# Rescale the counts so that all bins together add up to one
H = H / H.sum()

In [None]:
# np.histogram2d bins x along the first array index and y along the second,
# while imshow draws the first index along the vertical axis -- hence the
# transpose, so that x ends up horizontal and y vertical.
plt.imshow(H.T, origin='lower', cmap=plt.cm.binary, extent=[0, 2, 0, 2])
plt.xlabel('x')
plt.ylabel('y')
plt.title('An example 2D PDF')

# Print the total over all bins of H (1 when H is normalized)
print(np.sum(H))

In [None]:
# Joint distribution p(x, y) with its two marginals and three conditional slices
fig = plt.figure(figsize=(10, 5))

# define axes as (left, bottom, width, height) in figure coordinates
ax_Pxy = fig.add_axes((0.2, 0.34, 0.27, 0.52))
ax_Px = fig.add_axes((0.2, 0.14, 0.27, 0.2))
ax_Py = fig.add_axes((0.1, 0.34, 0.1, 0.52))
ax_cb = fig.add_axes((0.48, 0.34, 0.01, 0.52))
ax_Px_y = [fig.add_axes((0.65, 0.62, 0.32, 0.23)),
           fig.add_axes((0.65, 0.38, 0.32, 0.23)),
           fig.add_axes((0.65, 0.14, 0.32, 0.23))]

# hide tick labels on axes that share a coordinate with a neighbouring panel
ax_Px_y[0].xaxis.set_major_formatter(NullFormatter())
ax_Px_y[1].xaxis.set_major_formatter(NullFormatter())

ax_Pxy.xaxis.set_major_formatter(NullFormatter())
ax_Pxy.yaxis.set_major_formatter(NullFormatter())

ax_Px.yaxis.set_major_formatter(NullFormatter())
ax_Py.xaxis.set_major_formatter(NullFormatter())

# np.histogram2d returns H[x_bin, y_bin]. Transpose so that rows run along y
# and columns along x, which is what imshow and the row slices below need.
# The factor 1000 matches the "x 10^-3" annotation on the colorbar. Working on
# a copy leaves H untouched, so this cell gives the same result when re-run.
H_plot = 1000 * H.T

# draw the joint probability
image = ax_Pxy.imshow(H_plot, interpolation='nearest', origin='lower',
                      aspect='auto', extent=[0, 2, 0, 2], cmap=plt.cm.binary)

cb = fig.colorbar(image, cax=ax_cb)
cb.set_label('$p(x, y)$', fontsize = 14)
ax_cb.text(0, 1.02, r'$\times 10^{-3}$',
           transform=ax_cb.transAxes)

# draw p(x) distribution: sum over the y rows
ax_Px.plot(xbins[1:], H_plot.sum(0), '-k', drawstyle='steps')

# draw p(y) distribution: sum over the x columns
ax_Py.plot(H_plot.sum(1), ybins[1:], '-k', drawstyle='steps')

# define axis limits
ax_Pxy.set_xlim(0, 2)
ax_Pxy.set_ylim(0, 2)
ax_Px.set_xlim(0, 2)
ax_Py.set_ylim(0, 2)

# label axes
ax_Pxy.set_xlabel('$x$', fontsize = 14)
ax_Pxy.set_ylabel('$y$', fontsize = 14)
ax_Px.set_xlabel('$x$', fontsize = 14)
ax_Px.set_ylabel('$p(x)$', fontsize = 14)
ax_Px.yaxis.set_label_position('right')
ax_Py.set_ylabel('$y$', fontsize = 14)
ax_Py.set_xlabel('$p(y)$', fontsize = 14)
ax_Py.xaxis.set_label_position('top')

ax_Px.tick_params(axis='both', which='major', labelsize=10)
ax_Py.tick_params(axis='both', which='major', labelsize=10)

# draw conditional probabilities p(x | y) for three y bins (top, middle, bottom)
iy = [3 * Ngrid // 4, Ngrid // 2, Ngrid // 4]
colors = 'rgc'
axis = ax_Pxy.axis()  # remember the limits; the overplotted lines may change them

for panel, row, color in zip(ax_Px_y, iy, colors):
    y_lo, y_hi = ybins[row], ybins[row + 1]

    # overplot the selected y range on the joint probability
    ax_Pxy.plot([0, 2, 2, 0], [y_hi, y_hi, y_lo, y_lo], c=color, lw=1)

    # one row of the joint distribution, renormalized to sum to one
    Px_y = H_plot[row] / H_plot[row].sum()
    panel.plot(xbins[1:], Px_y, drawstyle='steps', c=color)
    panel.yaxis.set_major_formatter(NullFormatter())
    panel.set_ylabel('$p(x | y = %.1f)$' % y_lo, fontsize = 14)
    panel.tick_params(axis='both', which='major', labelsize=10)
ax_Pxy.axis(axis)

ax_Px_y[2].set_xlabel('$x$', fontsize = 14);


## Transformations of random variables

In [None]:
from scipy import stats

# Fix the seed so that the random sample is reproducible
np.random.seed(0)

# Uniform distribution on [0, 1]: a random sample, plus the PDF evaluated on a
# grid that extends beyond the support on both sides
uniform_dist = stats.uniform(loc=0, scale=1)
x_sample = uniform_dist.rvs(size=1000)
x = np.linspace(-0.5, 1.5, num=1000)
Px = uniform_dist.pdf(x)

# Push the model through y = exp(x): the grid maps point by point, and the
# density picks up the Jacobian factor |dx/dy| = 1 / y
y = np.exp(x)
Py = Px / y

# Push the sample through the same map
y_sample = np.exp(x_sample)


In [None]:
# Plot the results: sample histogram and model curve, before and after the transform
fig = plt.figure(figsize=(10, 4))
fig.subplots_adjust(left=0.11, right=0.95, wspace=0.3, bottom=0.17, top=0.9)

# left panel: the original variable x
ax = fig.add_subplot(121)
ax.hist(x_sample, 20, histtype='stepfilled', fc='#CCCCCC', density=True)
ax.plot(x, Px, '-k')
ax.set_xlim(-0.2, 1.2)
ax.set_ylim(0, 1.4001)
ax.xaxis.set_major_locator(plt.MaxNLocator(6))
ax.text(0.95, 0.95, r'$p_x(x) = {\rm Uniform}(x)$',
        va='top', ha='right',
        transform=ax.transAxes, fontsize = 12)
ax.set_xlabel('$x$', fontsize = 14)
ax.set_ylabel('$p_x(x)$', fontsize = 14)

# right panel: the transformed variable y
ax = fig.add_subplot(122)
ax.hist(y_sample, 20, histtype='stepfilled', fc='#CCCCCC', density=True)
ax.plot(y, Py, '-k')
ax.set_xlim(0.85, 2.9)
ax.xaxis.set_major_locator(plt.MaxNLocator(6))
# Not a raw string because '\n' must stay a newline; the LaTeX backslashes are
# escaped explicitly so Python does not see invalid escapes '\e' and '\l'.
ax.text(0.95, 0.95, '$y=\\exp(x)$\n$p_y(y)=p_x(\\ln y) / y$',
        va='top', ha='right',
        transform=ax.transAxes, fontsize = 12)
ax.set_xlabel('$y$',fontsize = 14)
ax.set_ylabel('$p_y(y)$', fontsize = 14);

## Now in a practical example -- transforming flux distributions into magnitude distributions

In [None]:
from scipy.stats import norm
np.random.seed(1)

# create distribution with 1% flux errors (mean 1, sigma 0.01)
dist = norm(1, 0.01)

# data points
flux = dist.rvs(10000)

# functional form of distribution on a uniformly spaced flux grid
flux_fit = np.linspace(0.001, 2, 1000)
pdf_flux_fit = dist.pdf(flux_fit)

# transform the sample into magnitude space, point by point
mag = -2.5 * np.log10(flux)

# transform the grid (the x-axis) the same way; the PDF values need more care
mag_fit = -2.5 * np.log10(flux_fit)

# width in magnitude of each interval between neighbouring grid points
mag_step = np.abs(np.diff(mag_fit))

# Numerical transform. Probability is conserved, p(F) dF = p(m) dm, so
# p(m) = p(F) dF / dm. The flux grid is uniform, so dF is a constant that the
# final normalization absorbs; only the division by dm is done explicitly.
# The first grid point has no interval to its left and is left unscaled (the
# flux PDF there is vanishingly small for this distribution).
pdf_mag_fit = pdf_flux_fit.copy()
pdf_mag_fit[1:] /= mag_step
# overall normalization: make the PDF integrate to one over the magnitude grid
pdf_mag_fit /= np.dot(pdf_mag_fit[1:], mag_step)

# Analytic transform: m = -2.5 log10(F)  =>  |dF/dm| = F ln(10) / 2.5,
# hence p(m) = p(F) F ln(10) / 2.5. Normalized on the same grid for comparison.
pdf_mag_fit2 = pdf_flux_fit * flux_fit * np.log(10) / 2.5
pdf_mag_fit2 /= np.dot(pdf_mag_fit2[1:], mag_step)

In [None]:
# take a look at what has been done, one panel per step
# (a 1x3 grid: exactly as many slots as there are panels)
fig, axes = plt.subplots(1, 3, figsize=(10, 3))

# step 1: the flux PDF on the flux grid
axes[0].plot(flux_fit, pdf_flux_fit)
axes[0].set_xlabel('flux')
axes[0].set_ylabel('P(flux)')
axes[0].set_xlim(0.8, 1.2)

# step 2: only the x-axis transformed -- still the flux PDF values
axes[1].plot(mag_fit, pdf_flux_fit)
axes[1].set_xlabel('mag')
axes[1].set_ylabel('P(flux)')
axes[1].set_xlim(-0.3, 0.3)

# step 3: the properly transformed PDF, numerical (solid) and analytic (dashed)
axes[2].plot(mag_fit, pdf_mag_fit)
axes[2].plot(mag_fit, pdf_mag_fit2, ls='--')
axes[2].set_xlabel('mag')
axes[2].set_ylabel('P(mag)')
axes[2].set_xlim(-0.3, 0.3)

fig.tight_layout()

In [None]:
# Repeat the exercise for a much broader flux distribution: mean 1 and
# sigma 0.20, i.e. 20% flux errors (note that the names carry a "25" suffix)
dist25 = norm(loc=1, scale=0.20)
flux25 = dist25.rvs(size=10000)
flux_fit25 = np.linspace(0.001, 2, num=1000)
pdf_flux_fit25 = dist25.pdf(flux_fit25)

# Sample and grid in magnitude space
mag25 = -2.5 * np.log10(flux25)
mag_fit25 = -2.5 * np.log10(flux_fit25)

# PDF in magnitude space: divide every point but the first by the width of
# the magnitude interval to its left, then normalize over the magnitude grid
pdf_mag_fit25 = np.array(pdf_flux_fit25)
pdf_mag_fit25[1:] = pdf_mag_fit25[1:] / np.abs(np.diff(mag_fit25))
pdf_mag_fit25 = pdf_mag_fit25 / np.dot(pdf_mag_fit25[1:], np.abs(np.diff(mag_fit25)))

In [None]:
fig, ax = plt.subplots(2,2)
fig.set_size_inches(10,8)
fig.tight_layout(w_pad=4, h_pad=7)

# Strings shared between panels
flux_xlabel, flux_ylabel = r'${\rm flux}$', r'$p({\rm flux})$'
mag_xlabel, mag_ylabel = r'${\rm mag}$', r'$p({\rm mag})$'
mag_note = r'${\rm mag} = -2.5\log_{10}({\rm flux})$'

# One row per panel:
# (axes, sample, (x range lo, hi), model grid, model pdf,
#  guide-line x, guide-line top, y limits or None, x label, y label, note)
panel_specs = [
    # 1% flux error
    (ax[0,0], flux, (0.75, 1.25), flux_fit, pdf_flux_fit,
     1, 42, None, flux_xlabel, flux_ylabel, r'${\rm 1\%\ flux\ error}$'),
    # 1% flux error, in magnitudes
    (ax[0,1], mag, (-0.25, 0.25), mag_fit, pdf_mag_fit,
     0, 42, None, mag_xlabel, mag_ylabel, mag_note),
    # 20% flux error
    (ax[1,0], flux25, (0, 2), flux_fit25, pdf_flux_fit25,
     1, 2.1, (0, 2.1), flux_xlabel, flux_ylabel, r'${\rm 20\%\ flux\ error}$'),
    # 20% flux error, in magnitudes
    (ax[1,1], mag25, (-1, 1), mag_fit25, pdf_mag_fit25,
     0, 2, None, mag_xlabel, mag_ylabel, mag_note),
]

for (panel, sample, (lo, hi), model_x, model_pdf,
     guide_x, guide_top, y_range, x_label, y_label, note) in panel_specs:
    # histogram of the sample; the bins span exactly the plotted x range
    panel.hist(sample, bins=np.linspace(lo, hi, 50),
               histtype='stepfilled', fc='gray', alpha=0.5, density=True)
    # model curve and a dotted vertical guide line
    panel.plot(model_x, model_pdf, '-k')
    panel.plot([guide_x, guide_x], [0, guide_top], ':k', lw=1)
    panel.set_xlim(lo, hi)
    if y_range is not None:
        panel.set_ylim(y_range[0], y_range[1])
    panel.set_xlabel(x_label, fontsize = 12)
    panel.set_ylabel(y_label, fontsize = 12)
    panel.text(0.04, 0.95, note,
               ha='left', va='top', transform=panel.transAxes)

## Descriptive statistics

In [None]:
import scipy

# First let's just look at a bunch of built-in functions in scipy and numpy

x = np.random.random(100) # 100 random numbers

q25, q50, q75 = np.percentile(x, [25, 50, 75])
mean = np.mean(x)
# NOTE: for floating-point random draws the values are presumably all distinct,
# in which case the mode carries little information; it is shown here only to
# illustrate the function call.
mode = scipy.stats.mode(x)
median = np.median(x)
variance = np.var(x)
standard_deviation = np.std(x)
skew = scipy.stats.skew(x)
kurtosis = scipy.stats.kurtosis(x)

print('25%, 50%, 75% quartiles:', q25, q50, q75)
print('mean:', mean)
print('mode:', mode)
print('median:', median)
print('variance:', variance)
print('standard deviation:', standard_deviation)
print('skew:', skew)
print('kurtosis:', kurtosis)

## Sample-based estimates of descriptive statistics (small and large N)

In [None]:
# Uncomment the next line to display the np.std docstring (note the `ddof` argument)
# np.std?

In [None]:
# Compare the two standard-deviation estimators over many independent samples
# drawn from a Gaussian with true sigma = 1.
n_trials = 5000      # number of independent samples
sample_size = 1000   # points per sample; try 10 to see the small-N behaviour

# one row per trial, drawn in a single call
samples = np.random.normal(loc=5.0, scale=1.0, size=(n_trials, sample_size))

# np.std divides by (N - ddof): ddof=0 is the uncorrected estimator,
# ddof=1 applies the N - 1 correction
uncorrected = np.std(samples, axis=1, ddof=0)
corrected = np.std(samples, axis=1, ddof=1)

fig, ax = plt.subplots(1,2)
fig.set_size_inches(10,4)
ax[0].hist(uncorrected)
ax[1].hist(corrected)
ax[0].set_title("Uncorrected Standard Deviations")
ax[1].set_title("Corrected Standard Deviations")
ax[0].set_xlabel("sample standard deviation")
ax[1].set_xlabel("sample standard deviation")

print('uncorrected:', np.mean(uncorrected))
print('corrected:', np.mean(corrected))

## Robust descriptive statistics

We did not discuss this in detail in class, but the median and the width estimator $\sigma_{G}=0.7413(q_{75}-q_{25})$ are useful estimators when outliers are present.

In [None]:
from astroML import stats

# Reproducible draw of 1000 points from a standard normal distribution
np.random.seed(0)
x = np.random.normal(loc=0.0, scale=1.0, size=1000)

# sigmaG of the sample; as the last expression it is shown as the cell output
stats.sigmaG(x)

In [None]:
from scipy.stats import cauchy
  
# Clean reference sample: 100 Gaussian draws centred on 8 with unit width
normal = np.random.normal(8.0, 1.0, 100)

# Contaminated sample: 95 draws from the same Gaussian ...
a = np.random.normal(8.0, 1.0, 95)
# ... followed by 5 draws from a Cauchy with the same location and scale 20
b = cauchy.rvs(loc=8.0, scale=20, size=5)
normal_with_outliers = np.concatenate((a, b))

In [None]:
# Location and width estimates for the clean and the contaminated sample
labels = ['no outliers', 'with outliers']
means = [np.mean(normal),np.mean(normal_with_outliers)]
standard_deviations = [np.std(normal),np.std(normal_with_outliers)]
medians = [np.median(normal), np.median(normal_with_outliers)]
sigmaG = [stats.sigmaG(normal), stats.sigmaG(normal_with_outliers)]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

# left panel: mean and standard deviation; right panel: median and sigmaG
fig, ax = plt.subplots(1,2)
fig.set_size_inches(10,4)
rects1 = ax[0].bar(x - width/2, means, width, label='mean', color = 'steelblue')
rects2 = ax[0].bar(x + width/2, standard_deviations, width, label='std', color = 'darkorange')
rects3 = ax[1].bar(x - width/2, medians, width, label='median', color = 'olivedrab')
rects4 = ax[1].bar(x + width/2, sigmaG, width, label='sigmaG', color = 'gold')

titles = ["means and std's","medians and sigmaG's"]
for panel, title in zip(ax, titles):
    panel.set_ylabel('value')
    panel.set_xticks(x)
    panel.set_xticklabels(labels)
    panel.legend()
    panel.set_title(title)

# upper y limit of the right panel: largest of all plotted values, plus a margin
combined = means + standard_deviations + medians + sigmaG
ax[1].set_ylim([0, np.max(combined)+0.5])

# annotate every bar with its value
for panel, rects in ((ax[0], rects1), (ax[0], rects2),
                     (ax[1], rects3), (ax[1], rects4)):
    panel.bar_label(rects, padding=3)
fig.tight_layout()