In [None]:
import numpy as np
import seaborn as sns
import scipy.stats as stats
from matplotlib import pyplot as plt

## Sampling distribution

### The first example: sample of dice rolls

In [None]:
# Simulation settings: number of repeated experiments and die rolls per experiment.
n_draws = 100
sample_size = 100

# Fair six-sided die: discrete uniform on {1, ..., 6} (randint's upper bound is exclusive).
r = stats.randint(1, 7)
mu = r.mean()
var = r.var()

# Draw every sample in a single call: each row holds one sample of `sample_size`
# rolls, and the row mean is one realization of the sample mean.
sample_mean_realizations = r.rvs(size=(n_draws, sample_size)).mean(axis=1)

# Independent single rolls for comparison. randint yields integers, so cast to
# float to keep the same dtype as the sample means for downstream plotting.
rv_realizations = r.rvs(size=n_draws).astype(float)

What do you see in the following picture?

This is a visual representation of the fact that any function of the sample (a statistic), e.g. the sample mean, is itself random! Each time you generate a new sample, the value of the chosen function will be different.

Try changing the ```sample_size``` variable and see what happens.

In [None]:
# Two stacked panels: individual die rolls on top, sample means underneath,
# each with the true mean drawn as a horizontal reference line.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
draw_index = np.arange(n_draws)
true_mean = r.mean()

panels = (
    (
        ax1,
        rv_realizations,
        r'single RV realization, $x$',
        r'Different random realizations of a single random variable $X$',
    ),
    (
        ax2,
        sample_mean_realizations,
        r'Sample mean realization, $\bar{x}$',
        r'Different random realizations of sample mean $\bar{X}$, sample size = %d' % sample_size,
    ),
)
for ax, values, point_label, panel_title in panels:
    ax.scatter(draw_index, values, label=point_label)
    ax.axhline(true_mean, color='green', label='True mean')
    ax.legend()
    ax.set_title(panel_title)

plt.show()

While the previous picture presents just a few different realizations, in the following one we generate many more samples and try to approximate the probability density function by plotting a histogram.

Also, try changing ```sample_size``` in the cell below and see what happens to the histogram.

In [None]:
# Many repetitions so that the histograms below approximate the distributions well.
n_draws = 10000
sample_size = 10
n_bins = 50

# Fair six-sided die: discrete uniform on {1, ..., 6} (randint's upper bound is exclusive).
r = stats.randint(1, 7)

# One vectorized draw instead of a Python loop with 2 * n_draws scipy calls:
# each row is one sample of `sample_size` rolls, its mean one sample-mean realization.
sample_mean_realizations = r.rvs(size=(n_draws, sample_size)).mean(axis=1)

# Independent single rolls for comparison, as floats to match the sample means' dtype.
rv_realizations = r.rvs(size=n_draws).astype(float)

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))

# Labels are set so that the legend calls below have something to show.
sns.histplot(rv_realizations, bins=n_bins, ax=ax1, label='Single RV realizations')
ax1.grid()

sns.histplot(sample_mean_realizations, bins=n_bins, ax=ax2, label='Sample mean realizations')
ax2.grid()

# Bin counts are only needed to scale the normal curve; compute them directly
# instead of drawing a second, invisible histogram on the axes.
counts, _ = np.histogram(sample_mean_realizations, bins=n_bins)

# Parameters of the normal approximation of the sample mean (CLT): mean mu and
# standard deviation sigma / sqrt(n). They are read from `r` so this cell does
# not depend on `mu`/`var` left over from an earlier cell.
clt_mean = r.mean()
clt_std = np.sqrt(r.var() / sample_size)

# Plot over +/- 4 standard deviations of the sample mean, so the curve covers
# the histogram without stretching the x-axis far beyond the data.
x_space = np.linspace(clt_mean - 4 * clt_std, clt_mean + 4 * clt_std, 1000)

# The histogram shows counts, not a density, so the normal PDF is rescaled:
# dividing by its peak value gives a curve with maximum 1, which is then
# stretched to the height of the tallest bin.
peak_density = stats.norm.pdf(clt_mean, clt_mean, clt_std)
normal_curve = np.max(counts) * stats.norm.pdf(x_space, clt_mean, clt_std) / peak_density
ax2.plot(x_space, normal_curve, label='Normal density')

ax1.legend()
ax2.legend()
ax1.set_title('Histogram for realizations of a single RV')
ax2.set_title(r'Histogram for realizations of sample mean $\bar{X}$, sample size = %d' % sample_size)
plt.show()

### The second example: sampling from normal distribution

In [None]:
# Normal distribution with mean 5 and standard deviation 2 (variance 4).
r = stats.norm(5, 2)
n_draws = 100
sample_size = 2
mu = r.mean()
var = r.var()
std = r.std()

# Draw every sample in a single call: each row holds one sample of
# `sample_size` values, and the row mean is one realization of the sample mean.
sample_mean_realizations = r.rvs(size=(n_draws, sample_size)).mean(axis=1)

# Independent single draws from the same distribution, for comparison.
rv_realizations = r.rvs(size=n_draws)

In [None]:
# Two stacked panels: single draws on top, sample means underneath. Both show
# the true mean and the population mu +/- sigma band as horizontal lines.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
draw_index = np.arange(n_draws)
center = r.mean()

series_by_axis = [
    (ax1, rv_realizations, r'single RV realization, $x$'),
    (ax2, sample_mean_realizations, r'Sample mean realization, $\bar{x}$'),
]
for ax, values, point_label in series_by_axis:
    ax.scatter(draw_index, values, label=point_label)
    ax.axhline(center, color='green', label='True mean')
    for sign, band_label in ((1, r'$\mu + \sigma$'), (-1, r'$\mu - \sigma$')):
        ax.axhline(center + sign * std, color='blue', label=band_label)
    ax.legend(loc='best')

ax1.set_title(r'Different random realizations of a single random variable $X \sim N$({},{})'.format(mu,var))
ax2.set_title(r'Different random realizations of sample mean $\bar{X}$, sample size = %d' % sample_size)
plt.show()

In [None]:
# Many repetitions so that the histograms below approximate the distributions well.
# `r` is the distribution object created in an earlier cell.
n_draws = 10000
sample_size = 30
n_bins = 50

# One vectorized draw instead of a Python loop with 2 * n_draws scipy calls:
# each row is one sample of `sample_size` values, its mean one sample-mean realization.
sample_mean_realizations = r.rvs(size=(n_draws, sample_size)).mean(axis=1)

# Independent single draws for comparison; np.asarray(..., dtype=float) keeps
# the float64 dtype of the original buffer regardless of the distribution in `r`.
rv_realizations = np.asarray(r.rvs(size=n_draws), dtype=float)

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))

# Labels are set so that the legend calls below have something to show.
sns.histplot(rv_realizations, bins=n_bins, ax=ax1, label='Single RV realizations')
ax1.grid()

sns.histplot(sample_mean_realizations, bins=n_bins, ax=ax2, label='Sample mean realizations')
ax2.grid()

# Bin counts are only needed to scale the normal curve; compute them directly
# instead of drawing a second, invisible histogram on the axes.
counts, _ = np.histogram(sample_mean_realizations, bins=n_bins)

# Distribution of the sample mean: mean mu and standard deviation
# sigma / sqrt(n). They are read from `r` so this cell does not depend on
# `mu`/`var` left over from an earlier cell.
clt_mean = r.mean()
clt_std = np.sqrt(r.var() / sample_size)

# Plot over +/- 4 standard deviations of the sample mean, so the curve covers
# the histogram without stretching the x-axis far beyond the data.
x_space = np.linspace(clt_mean - 4 * clt_std, clt_mean + 4 * clt_std, 1000)

# The histogram shows counts, not a density, so the normal PDF is rescaled:
# dividing by its peak value gives a curve with maximum 1, which is then
# stretched to the height of the tallest bin.
peak_density = stats.norm.pdf(clt_mean, clt_mean, clt_std)
normal_curve = np.max(counts) * stats.norm.pdf(x_space, clt_mean, clt_std) / peak_density
ax2.plot(x_space, normal_curve, label='Normal density')

ax1.legend()
ax2.legend()
ax1.set_title('Histogram for realizations of a single RV')
ax2.set_title(r'Histogram for realizations of sample mean $\bar{X}$, sample size = %d' % sample_size)
plt.show()