This notebook checks the approximation quality of a distribution derived from quantile values.

In [2]:
# Directory used when saving figures from this notebook.
# NOTE(review): the folder name hard-codes "sample_size=75"; keep it in sync
# with the `sample_size` variable set in the experiment cell.
output_dir = "./figs/distribution_quality/sample_size=75"

In [3]:
import numpy as np
import math
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib import rcParams
from naslib.optimizers.bananas.distribution import PointwiseInterpolatedDist

In [4]:
# Global figure appearance: ggplot theme, extra padding above titles, small font.
plt.style.use('ggplot')
rcParams.update({'axes.titlepad': 20, 'font.size': 9})

In [5]:
def fit_distribution(samples, num_quantiles) -> PointwiseInterpolatedDist:
    """Build a ``PointwiseInterpolatedDist`` from empirical quantiles of ``samples``.

    The probability levels are ``num_quantiles + 1`` evenly spaced points on
    ``[0, 1]``. For each level ``p`` the sample quantile is not read at ``p``
    itself but at the adjusted level ``ceil((n + 1) * p) / n`` (capped at
    ``1.0``), where ``n = len(samples)``.

    Args:
        samples: Sized collection of numeric observations.
        num_quantiles: Number of equal-probability intervals to split
            ``[0, 1]`` into.

    Returns:
        A ``PointwiseInterpolatedDist`` constructed with
        ``values=(levels, quantiles)`` and ``std=np.std(samples)``.
    """
    n = len(samples)
    spread = np.std(samples)

    def _adjusted_level(p):
        # Round the level up to the next multiple of 1/n; never exceed 1.
        rank = math.ceil((n + 1) * p)
        return min(rank / n, 1.0)

    levels = np.linspace(0, 1, num_quantiles + 1)
    quantiles = [np.quantile(samples, _adjusted_level(p)) for p in levels]

    return PointwiseInterpolatedDist(values=(levels, quantiles), std=spread)


In [99]:
# Reference distribution the estimate is compared against.
true_dist = stats.norm(loc=0, scale=2)
true_dist.name = "Gaussian Distribution"

# Draw the observations that the estimate is fitted on.
sample_size = 75
samples = true_dist.rvs(size=sample_size)

# Fit the quantile-based approximation.
num_quantiles = 20
est_dist = fit_distribution(samples, num_quantiles)

In [None]:
# Display the density values of the fitted distribution (these are plotted
# against the interval centers in the density plot below).
est_dist.densities

In [None]:
# Display the intervals of the fitted distribution; each entry is unpacked as
# a (left, right) pair when the interval centers are computed below.
est_dist.intervals

In [102]:
def _mid(interval):
    """Return the midpoint of a two-element ``(left, right)`` interval."""
    lower, upper = interval
    total = upper + lower
    return total / 2

# x-positions for the estimated density: the midpoint of every interval.
interval_centers = [_mid(bounds) for bounds in est_dist.intervals]

### Density Plot

In [103]:
# Name, mean and standard deviation of the reference distribution; reused for
# the plot titles and the x-axis range of the plots below.
dist_name, mean, std = true_dist.name, true_dist.mean(), true_dist.std()

In [None]:
fig, ax = plt.subplots()

title = f"{dist_name} (mean={mean}, std={std})"
ax.set_title(title)

# reference curve: true pdf on a window of three standard deviations around the mean
x = np.linspace(mean - 3 * std, mean + 3 * std, 100)
ax.plot(x, true_dist.pdf(x), label="Actual", alpha=0.8)

# estimated density drawn as steps centred on the interval midpoints
ax.step(interval_centers, est_dist.densities, where="mid", label="Estimated", alpha=0.8)

ax.set_xlabel("x")
ax.set_ylabel("pdf(x)")
ax.legend(loc="upper right")
fig.tight_layout()

# save
# fig.savefig(output_dir + f"/{title}_pdf_plot.pdf")

### CDF Plot

In [None]:
fig, ax = plt.subplots()

title = f"{dist_name} (mean={mean}, std={std})"
ax.set_title(title)

# reference curve: true cdf on a window of three standard deviations around the mean
x = np.linspace(mean - 3 * std, mean + 3 * std, 100)
ax.plot(x, true_dist.cdf(x), label="Actual", alpha=0.8)

# estimated cdf, evaluated one grid point at a time
cdfs = list(map(est_dist.cdf, x))
ax.plot(x, cdfs, label="Estimated", alpha=0.8)

ax.set_xlabel("x")
ax.set_ylabel("cdf(x)")
ax.legend(loc="lower right")
fig.tight_layout()

# save
# fig.savefig(output_dir + f"/{title}_cdf_plot.pdf")

### Sampling Histogram Plot

In [None]:
bins = 20
n_samples = len(samples)

fig, ax = plt.subplots()

title = f"{dist_name} (mean={mean}, std={std})"
ax.set_title(title)

# fresh draws from the reference distribution
ax.hist(true_dist.rvs(size=n_samples), bins=bins, alpha=0.7, label="Actual")

# the same number of draws from the fitted distribution
ax.hist(est_dist.rvs(size=n_samples), bins=bins, alpha=0.7, label="Estimated")

ax.set_xlabel("value")
ax.set_ylabel("Frequency")
ax.legend(loc="upper right")
fig.tight_layout()

In [94]:
# save
import os

# savefig does not create missing directories, so make sure the target
# folder exists before writing the figure.
os.makedirs(output_dir, exist_ok=True)
fig.savefig(output_dir + f"/{title}_rvs_plot.pdf")