# Comparison between different estimators of distributions

Currently compares histogram, kernel density estimate (KDE), and maximum-likelihood estimates of the form factor of a sphere in small-angle neutron scattering (SANS).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import scipy
from statsmodels.nonparametric.kde import KDEUnivariate, kernel_switch
from halt.models import sphere, sphere_pdf
%matplotlib widget

In [None]:
# Random number generator with a fixed seed so that the drawn samples are
# the same on every run of the notebook.
rng = np.random.default_rng(174)

In [None]:
# Distribution that all samples are drawn from: the sphere model with
# r=2.0, loc=0.0 and scale=1.0.
# NOTE(review): `sphere` is used below like a scipy.stats distribution
# (rvs / pdf / fit / kwds) -- confirm against halt.models.
true_distribution = sphere(r=2.0, loc=0.0, scale=1.0)

In [None]:
# Evaluation grid: 1000 evenly spaced points on [0, 5]. Densities are
# evaluated and plotted on this grid, and its end points set the range
# covered by the histogram bins.
x = np.linspace(0, 5, 1000)

In [None]:
# Nested samples: draw one sample of the last (largest) size and use its
# leading `size` points for every entry, so each sample extends the previous one.
sample_sizes = [100, 250, 500, 750, 1000, 5000]
full_sample = true_distribution.rvs(size=sample_sizes[-1], random_state=rng)
samples = dict((size, full_sample[:size]) for size in sample_sizes)

In [None]:
def make_bin_edges(sample, xmin, xmax):
    """
    Build bins for histogramming using the Freedman–Diaconis rule.

    The bin width is ``2 * IQR(sample) / len(sample)**(1/3)``. Edges start at
    ``xmin`` and are spaced by that width; the last edge is the first one
    that is greater than or equal to ``xmax``, so the bins cover
    ``[xmin, xmax]``.

    Parameters
    ----------
    sample : array_like
        Observations used to determine the bin width.
    xmin, xmax : float
        Range that the bins must cover.

    Returns
    -------
    numpy.ndarray
        Increasing, equally spaced bin edges.

    Raises
    ------
    ValueError
        If the sample is empty or the rule yields a width that is not a
        positive finite number (e.g. the interquartile range is zero).
    """
    if len(sample) == 0:
        raise ValueError("cannot build bin edges from an empty sample")

    width = 2 * stats.iqr(sample) / len(sample)**(1/3)
    if not np.isfinite(width) or width <= 0:
        raise ValueError(
            f"Freedman–Diaconis bin width must be positive and finite, got {width}"
        )

    # Count the bins explicitly instead of relying on np.arange with a
    # floating-point step, whose output length is sensitive to rounding.
    n_bins = int(np.ceil((xmax - xmin) / width))
    return xmin + width * np.arange(n_bins + 1)

In [None]:
def histogram(sample):
    """
    Estimate the density of `sample` with a normalised histogram.

    The bins come from make_bin_edges and span the range of the global
    evaluation grid `x`.

    Returns
    -------
    tuple of numpy.ndarray
        The bin centres and the density in each bin.
    """
    edges = make_bin_edges(sample, xmin=x[0], xmax=x[-1])
    density, _ = np.histogram(sample, bins=edges, density=True)
    centres = (edges[1:] + edges[:-1]) / 2
    return centres, density

# Histogram estimate (bin centres, densities) for every sample, keyed by sample size.
hists = {size: histogram(draws) for size, draws in samples.items()}

In [None]:
def kde(sample):
    """
    Estimate the density of `sample` with a univariate kernel density estimator.

    The estimator is fitted with the default settings of KDEUnivariate.fit
    and then evaluated on the global grid `x`.

    Returns
    -------
    The estimated density at every point of `x`.
    """
    density_estimator = KDEUnivariate(sample)
    density_estimator.fit()
    return density_estimator.evaluate(x)
    
# KDE evaluated on the grid `x` for every sample, keyed by sample size.
kdes = {size: kde(draws) for size, draws in samples.items()}

In [None]:
def maximum_likelihood(sample):
    """
    Fit the sphere model to `sample` with loc fixed at 0 and scale fixed at 1.

    Returns
    -------
    The first entry of the parameter tuple returned by sphere.fit.
    NOTE(review): presumably the radius r -- confirm the parameter order of `sphere`.
    """
    fitted_parameters = sphere.fit(sample, floc=0.0, fscale=1.0)
    return fitted_parameters[0]

# Maximum-likelihood estimate of the model parameter for every sample, keyed by sample size.
mles = {size: maximum_likelihood(draws) for size, draws in samples.items()}

In [None]:
# One panel per sample size on a grid that is at most three columns wide.
# Each panel overlays the true density with the histogram, maximum-likelihood
# and KDE estimates obtained from that sample.
n_plots = len(samples)
fig, axs = plt.subplots(nrows=-(-n_plots // 3),  # ceiling division: rows needed for n_plots panels
                        ncols=min(n_plots, 3),
                        squeeze=False, sharex=True, sharey=True,
                        gridspec_kw={'wspace': 0, 'hspace': 0})

for ax, n in zip(axs.flat, samples):
    ax.plot(x, true_distribution.pdf(x), c='k', label='pdf')
    ax.step(*hists[n], where='mid', label=f'hist({n})')
    ax.plot(x, sphere.pdf(x, r=mles[n]), ls='--', label=f'MLE({n})')
    ax.plot(x, kdes[n], label=f'kde({n})')

    # Logarithmic density axis with fixed limits, identical in every panel.
    ax.set_yscale('log')
    ax.set_ylim((1e-6, 2))

# The axes are shared and the panels touch, so only the left column and the
# bottom row carry axis labels.
for ax in axs[:, 0]:
    ax.set_ylabel('density')

for ax in axs[-1, :]:
    ax.set_xlabel('q')

# A single legend, drawn in the first panel.
axs[0, 0].legend()
fig.tight_layout()

In [None]:
def fit_model(q, r):
    """
    Model function used for curve fitting: sphere_pdf evaluated at `q` with parameter `r`.

    `q` is the independent variable and `r` the single free parameter; both
    are passed on to sphere_pdf unchanged.
    """
    return sphere_pdf(q, r)

def unary_fit(*args, **kwargs):
    """
    Fit a model with exactly one free parameter using scipy.optimize.curve_fit.

    All positional and keyword arguments are forwarded unchanged to curve_fit.

    Returns
    -------
    p : float
        Best-fit value of the parameter.
    e : float
        One-standard-deviation uncertainty of `p`: the square root of the
        variance that curve_fit reports in its 1x1 covariance matrix.

    Raises
    ------
    ValueError
        Raised by the unpacking if curve_fit does not return exactly one
        parameter with a 1x1 covariance matrix.
    """
    # The nested unpacking doubles as a check that there is a single parameter.
    (p,), ((variance,),) = scipy.optimize.curve_fit(*args, **kwargs)
    # curve_fit returns a variance; error bars need the standard deviation.
    return p, np.sqrt(variance)

In [None]:
# Compare the parameter r recovered by each estimator as a function of sample size.
fig, ax = plt.subplots(1)

# Horizontal reference line at the value of r of the distribution the samples were drawn from.
ax.axhline(true_distribution.kwds['r'], c='k', label='true')

# Maximum-likelihood estimates: transpose {n: r} into (sizes, values) for plotting.
ml_fits = list(zip(*mles.items()))
ax.plot(*ml_fits, label='ML')

# Least-squares fits of fit_model to each density estimate. Every fit yields
# (n, value, uncertainty); transposing gives the (x, y, yerr) arguments of errorbar.
hist_fits = list(zip(*(
    (size, *unary_fit(fit_model, xdata=centres, ydata=heights, p0=[1]))
    for size, (centres, heights) in hists.items()
)))
ax.errorbar(*hist_fits, label='hist')

kde_fits = list(zip(*(
    (size, *unary_fit(fit_model, xdata=x, ydata=estimate, p0=[1]))
    for size, estimate in kdes.items()
)))
ax.errorbar(*kde_fits, label='kde')

# Same KDE fit, but ignoring the first 30 grid points.
# NOTE(review): presumably this excludes the region near q = 0 where the KDE
# is affected by the boundary -- confirm.
trunc_kde_fits = list(zip(*(
    (size, *unary_fit(fit_model, xdata=x[30:], ydata=estimate[30:], p0=[1]))
    for size, estimate in kdes.items()
)))
ax.errorbar(*trunc_kde_fits, label='truncated kde')

ax.set_xlabel('n')
ax.set_ylabel('r')
ax.set_xscale('log')
ax.legend()