# Comparison between different estimators of distributions

Currently compares a histogram and a kernel density estimate (KDE) as estimators of the form factor of a sphere in small-angle neutron scattering (SANS).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.nonparametric.kde import KDEUnivariate, kernel_switch
%matplotlib widget

In [None]:
# Seeded generator shared by all sampling below so that reruns are reproducible.
rng = np.random.default_rng(seed=123)

In [None]:
def sans_sphere(q, r):
    """
    Evaluate the sphere form factor ``9 (sin(qr) - qr cos(qr))**2 / (qr)**6``.

    Parameters
    ----------
    q : float or array_like
        Value(s) at which to evaluate the form factor.
    r : float
        Scale multiplied onto ``q`` (the sphere radius).

    Returns
    -------
    numpy.ndarray
        Form factor with the broadcast shape of ``q * r``; equals 1 at
        ``q * r == 0``.

    Notes
    -----
    The closed-form expression is 0/0 at ``qr == 0`` and loses all
    significant digits for small ``qr`` because ``sin(qr)`` and
    ``qr * cos(qr)`` nearly cancel.  For ``|qr| < 1e-2`` the Taylor
    expansion ``1 - (qr)**2 / 5 + 3 (qr)**4 / 175`` is used instead; its
    truncation error there is below 1e-15.
    """
    qr = np.asarray(q, dtype=float) * r
    small = np.abs(qr) < 1e-2

    # np.where evaluates both branches, so feed each formula only arguments
    # that are safe for it: a harmless placeholder replaces the small values
    # in the closed form, and zero replaces the large values in the series.
    safe = np.where(small, 1.0, qr)
    closed_form = 9 * (np.sin(safe) - safe * np.cos(safe))**2 / safe**6

    tiny_sq = np.where(small, qr, 0.0)**2
    series = 1 - tiny_sq / 5 + 3 * tiny_sq**2 / 175

    return np.where(small, series, closed_form)

In [None]:
def pdf(x):
    """Target distribution: ``sans_sphere`` evaluated with ``r = 2``."""
    return sans_sphere(x, 2)


# Evaluation grid shared by the histogram range, the KDE and the plot.
x = np.linspace(0.0, 5.0, num=1000)

In [None]:
def generate(rng, pdf, size, x_search_max=10.0, n_grid=10_001):
    """
    Draw samples from ``pdf`` on ``x >= 0`` with the ratio-of-uniforms method.

    Parameters
    ----------
    rng : numpy.random.Generator
        Random number generator handed to SciPy as ``random_state``.
    pdf : callable
        Vectorised, non-negative (not necessarily normalised) density.
    size : int or tuple of int
        Number / shape of samples to draw.
    x_search_max : float, optional
        Upper end of the interval ``[0, x_search_max]`` that is searched for
        the bounds of the sampling rectangle.  The maxima of ``sqrt(pdf(x))``
        and ``x * sqrt(pdf(x))`` are assumed to lie inside this interval.
    n_grid : int, optional
        Number of grid points used for that search.

    Returns
    -------
    numpy.ndarray
        The generated samples.

    Raises
    ------
    ValueError
        If the bounds derived from ``pdf`` are not finite and positive.

    Notes
    -----
    The method needs ``umax >= sup sqrt(pdf(x))`` and
    ``vmax >= sup x * sqrt(pdf(x))``.  Evaluating the latter at
    ``x = sqrt(2)`` is only correct for the standard normal distribution;
    for any other density it can underestimate the bound and silently bias
    the samples, so both bounds are determined numerically here.
    """
    grid = np.linspace(0.0, x_search_max, n_grid)
    root_pdf = np.sqrt(pdf(grid))

    # A slightly too large rectangle only lowers the acceptance rate, whereas
    # a too small one biases the result, so pad the grid maxima by 1 %.
    margin = 1.01
    umax = margin * float(np.max(root_pdf))
    vmin = 0.0  # restricts the samples to x >= 0
    vmax = margin * float(np.max(grid * root_pdf))

    if not (np.isfinite(umax) and np.isfinite(vmax) and umax > 0 and vmax > 0):
        raise ValueError(
            "could not determine finite, positive ratio-of-uniforms bounds "
            f"from pdf on [0, {x_search_max}]"
        )

    try:
        # SciPy >= 1.12; rvs_ratio_uniforms is deprecated there and was
        # removed in later releases.
        from scipy.stats.sampling import RatioUniforms
    except ImportError:
        return stats.rvs_ratio_uniforms(pdf, umax, vmin, vmax, size=size, random_state=rng)
    sampler = RatioUniforms(pdf, umax=umax, vmin=vmin, vmax=vmax, random_state=rng)
    return sampler.rvs(size)
# One sample set per size, keyed by the number of draws.
samples = {
    n: generate(rng, pdf, size=n)
    for n in (100, 1_000, 10_000, 100_000)
}

In [None]:
def make_bin_edges(sample, xmin, xmax):
    """
    Build bins for histogramming using the Freedman–Diaconis rule.

    The bin width is ``2 * IQR(sample) / len(sample)**(1/3)``.

    Parameters
    ----------
    sample : array_like
        Data from which the bin width is derived.
    xmin, xmax : float
        Range to cover.  The first edge is ``xmin``; edges are added in steps
        of the bin width until ``xmax`` is covered.

    Returns
    -------
    numpy.ndarray
        Monotonically increasing bin edges.  If no usable width can be
        derived (empty sample or zero interquartile range), a single bin
        ``[xmin, xmax]`` is returned instead of failing.
    """
    n_points = len(sample)
    if n_points == 0:
        return np.array([xmin, xmax], dtype=float)

    width = 2 * stats.iqr(sample) / n_points**(1/3)
    # A zero or non-finite step would make np.arange raise.
    if not np.isfinite(width) or width <= 0:
        return np.array([xmin, xmax], dtype=float)

    return np.arange(xmin, xmax+width, width)

In [None]:
def histogram(sample):
    """
    Estimate the density of ``sample`` with a normalised histogram.

    Bins come from ``make_bin_edges`` and span the range of the global
    grid ``x``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(bin_centres, density)``.
    """
    edges = make_bin_edges(sample, xmin=x[0], xmax=x[-1])
    centres = 0.5 * (edges[:-1] + edges[1:])
    density, _ = np.histogram(sample, bins=edges, density=True)
    return centres, density

# Histogram estimate (bin centres, density) for every sample size.
hists = {size: histogram(draws) for size, draws in samples.items()}

In [None]:
def kde(sample):
    """
    Estimate the density of ``sample`` with a kernel density estimate.

    A ``KDEUnivariate`` is fitted with its default settings and evaluated
    on the global grid ``x``.

    Returns
    -------
    The estimated density at each point of ``x``.
    """
    density_model = KDEUnivariate(sample)
    density_model.fit()
    return density_model.evaluate(x)
    
# KDE estimate on the grid `x` for every sample size.
kdes = {size: kde(draws) for size, draws in samples.items()}

In [None]:
fig, ax = plt.subplots()

# Histogram estimates: unconnected markers at the bin centres, one colour
# per sample size.
for i, (n, (q, density)) in enumerate(hists.items()):
    ax.plot(
        q,
        density,
        linestyle='',
        marker='.',
        markersize=3,
        color=f'C{i}',
        label=f'hist({n})',
    )

# KDE estimates: lines on the common grid, reusing the colour of the
# histogram with the same sample size.
for i, (n, density) in enumerate(kdes.items()):
    ax.plot(x, density, color=f'C{i}', label=f'kde({n})')

# Reference curve the estimators are compared against.
ax.plot(x, pdf(x), color='k', label='pdf')
ax.set_yscale('log')
ax.legend()