# Comparison between different estimators of distributions

Currently compares a histogram and a kernel density estimate (KDE) for estimating the form factor of a sphere in small-angle neutron scattering (SANS).

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import scipy
from statsmodels.nonparametric.kde import KDEUnivariate, kernel_switch
from halt.models import Sphere, sphere_pdf
%matplotlib widget

In [2]:
# Fixed seed so that the random draws, and every estimate derived from them, are reproducible.
rng = np.random.default_rng(174)

In [3]:
# Reference distribution: the samples are drawn from it, and its pdf and r are plotted as the truth.
# NOTE(review): r is presumably the sphere radius — confirm against halt.models.Sphere.
true_distribution = Sphere(r=2)

In [4]:
# Shared evaluation grid: 1000 evenly spaced points on [0, 5]. Its end points set the
# histogram range, and the KDEs and the true pdf are evaluated and plotted on it.
x = np.linspace(0, 5, 1000)

In [5]:
# Sample sizes 10**2 ... 10**5, one decade apart.
sample_sizes = np.power(10, np.array([2, 3, 4, 5]))

# Draw the largest sample once; each smaller sample is a leading slice of it,
# so every sample is contained in the next larger one.
full_sample = true_distribution.rvs(random_state=rng, size=sample_sizes[-1])
samples = {size: full_sample[:size] for size in sample_sizes}

In [6]:
def make_bin_edges(sample, xmin, xmax):
    """
    Return equal-width histogram bin edges that start at ``xmin``.

    The bin width follows the Freedman–Diaconis rule,
    ``2 * IQR(sample) / len(sample)**(1/3)``. Edges advance from ``xmin`` in
    steps of that width; the upper limit handed to ``np.arange`` is padded by
    one width so that the bins reach ``xmax`` (the last edge may lie beyond it).
    """
    n_points = len(sample)
    spread = stats.iqr(sample)
    bin_width = 2 * spread / n_points ** (1 / 3)
    upper_limit = xmax + bin_width
    return np.arange(xmin, upper_limit, bin_width)

In [7]:
def histogram(sample):
    """
    Estimate the density of ``sample`` with a normalised histogram.

    Bin edges come from ``make_bin_edges`` over the range spanned by the
    module-level grid ``x``. Returns ``(bin_centres, density)``, where
    ``density`` holds the ``np.histogram(..., density=True)`` value per bin.
    """
    edges = make_bin_edges(sample, xmin=x[0], xmax=x[-1])
    lower, upper = edges[:-1], edges[1:]
    centres = 0.5 * (upper + lower)
    # NOTE(review): density=True normalises over the samples that fall inside
    # the bins only; if the sample can exceed the binned range the result is
    # scaled up relative to the true pdf — confirm this is acceptable.
    density, _ = np.histogram(sample, bins=edges, density=True)
    return centres, density

# Histogram estimate (bin centres, density) for each sample size, keyed by that size.
hists = {n: histogram(sample) for n, sample in samples.items()}

In [8]:
def kde(sample):
    """
    Estimate the density of ``sample`` with a kernel density estimator.

    Builds a ``KDEUnivariate`` on the sample, fits it with the default
    ``fit()`` settings, and returns the estimate evaluated on the
    module-level grid ``x``.
    """
    model = KDEUnivariate(sample)
    model.fit()
    density_on_grid = model.evaluate(x)
    return density_on_grid
    
# KDE estimate evaluated on the grid x for each sample size, keyed by that size.
kdes = {n: kde(sample) for n, sample in samples.items()}

In [15]:
fig, ax = plt.subplots(1)

# Histogram estimates: bare markers at the bin centres, one colour per sample size.
for i, (n, (q, density)) in enumerate(hists.items()):
    ax.plot(
        q,
        density,
        linestyle='',
        marker='.',
        markersize=3,
        color=f'C{i}',
        label=f'hist({n})',
    )

# KDE estimates: lines on the grid x, indexed the same way so that each one
# shares its colour with the histogram of the same sample size.
for i, (n, density) in enumerate(kdes.items()):
    ax.plot(x, density, color=f'C{i}', label=f'kde({n})')

# True density for reference.
ax.plot(x, true_distribution.pdf(x), color='k', label='pdf')

ax.set_xlabel('q')
ax.set_ylabel('density')
# Logarithmic density axis with a fixed window.
ax.set_yscale('log')
ax.set_ylim(1e-6, 2)
ax.legend()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x7fbfd52b8bb0>

In [10]:
def fit_model(q, r):
    """
    Model function handed to the curve fits: evaluates ``sphere_pdf(q, r)``.

    ``q`` is the independent variable and ``r`` the single parameter that is
    fitted.
    NOTE(review): ``r`` is presumably the sphere radius — confirm against
    ``halt.models.sphere_pdf``.
    """
    return sphere_pdf(q, r)

def unary_fit(*args, **kwargs):
    """
    Fit a model with exactly one free parameter using ``curve_fit``.

    All arguments are forwarded unchanged to ``scipy.optimize.curve_fit``.

    Returns
    -------
    p
        The fitted parameter value.
    e
        The one-standard-deviation uncertainty of ``p``, i.e. the square root
        of the single entry of the covariance matrix.

    Raises
    ------
    ValueError
        From the unpacking if the fit does not have exactly one parameter.
    """
    # The nested unpacking enforces a 1-element popt and a 1x1 pcov.
    (p,), ((variance,),) = scipy.optimize.curve_fit(*args, **kwargs)
    # pcov holds variances; an error bar needs the standard deviation.
    return p, np.sqrt(variance)

In [18]:
# Fit the single parameter r of fit_model to every density estimate and plot
# the fitted value against the sample size n.
fig, ax = plt.subplots(1)

# Each comprehension yields one row (n, *unary_fit(...)) per sample size;
# zip(*rows) transposes the rows into three tuples that are passed
# positionally to errorbar as x, y and yerr. The second value returned by
# unary_fit therefore sets the half-length of the error bars.
# Every fit starts from the initial guess p0=[1].
hist_fits = list(zip(*[(n, *unary_fit(fit_model, xdata=q, ydata=density, p0=[1]))
                       for n, (q, density) in hists.items()]))
ax.errorbar(*hist_fits, label='hist')

# Same fit on the KDE, using the full grid x.
kde_fits = list(zip(*[(n, *unary_fit(fit_model, xdata=x, ydata=density, p0=[1]))
                      for n, density in kdes.items()]))
ax.errorbar(*kde_fits, label='kde')

# Same fit on the KDE with the first 30 grid points dropped.
# NOTE(review): presumably this excludes the region near q = 0 where the KDE
# is distorted by the boundary — confirm; 30 is a hard-coded cut-off.
trunc_kde_fits = list(zip(*[(n, *unary_fit(fit_model, xdata=x[30:], ydata=density[30:], p0=[1]))
                            for n, density in kdes.items()]))
ax.errorbar(*trunc_kde_fits, label='truncated kde')

# Horizontal reference line at the r of the distribution the samples came from.
ax.axhline(true_distribution.r, c='k', label='true')

ax.set_xlabel('n')
ax.set_ylabel('r')
ax.set_xscale('log')
ax.legend()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x7fbfd4f49fa0>