# Stats demo for appendix

_Alex Malz (GCCL@RUB)_

In [None]:
import numpy as np
import scipy.stats as sps
import sys

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Color map: reversed plasma palette on a logarithmic scale spanning 1..50
# (a linear alternative would be colors.Normalize(vmin=0, vmax=50)).
cm = plt.get_cmap('plasma_r')
rainbow = cm
cNorm = colors.LogNorm(vmin=1, vmax=50)
scalarMap = cmx.ScalarMappable(cmap=rainbow, norm=cNorm)
# One RGBA row for each integer 1, 2, ..., 49
color_map = scalarMap.to_rgba(np.arange(1, 50))

## Preprocessing

### How to turn samples into a discrete distribution

In [None]:
# Smallest value allowed inside a logarithm: twice the smallest positive
# normalized float, so that np.log never receives zero or a negative number.
eps = 2. * sys.float_info.min

def safe_log(arr, threshold=eps):
    """
    Takes the natural logarithm of an array that might contain zeros.

    The input is never modified: values are floored on a float64 copy, so
    integer input is handled correctly and the caller's array keeps its
    original contents.

    Parameters
    ----------
    arr: ndarray, float
        array of values to be logged
    threshold: float, optional
        small, positive value to replace zeros and negative numbers

    Returns
    -------
    logged: ndarray
        logged values, with small value replacing un-loggable values
    """
    # Convert to float64 first: assigning the tiny threshold into an integer
    # (or float32) array would truncate it to 0 and the log would be -inf.
    values = np.asarray(arr, dtype=float)
    # np.maximum allocates a new array, leaving the caller's data untouched.
    floored = np.maximum(values, threshold)
    logged = np.log(floored)
    return logged

# def make_kde(Xgrid, Ygrid, Xsamps, Ysamps, to_log=False, save=None):
#     positions = np.vstack([Xgrid.ravel(), Ygrid.ravel()])
#     values = np.vstack([Xsamps, Ysamps])
#     kernel = sps.gaussian_kde(values, bw_method='scott')
#     Z = np.reshape(kernel(positions).T, Xgrid.shape)
#     if to_log:
#         return safe_log(Z)
#     else:
#         return Z
        
# TODO: normalize up here before log!

# def make_grid(x, y, x_ngrid=100, y_ngrid=100):
#     x_min = x.min()#-1.2
#     x_max = x.max()#-0.8
#     y_min = y.min()#0.2
#     y_max = y.max()#0.4

#     x_grid, y_grid = np.mgrid[x_min:x_max:x_ngrid*1.j, y_min:y_max:y_ngrid*1.j]
#     x_vec, y_vec = x_grid[:, 0], y_grid[0, :]
#     dx = (x_max - x_min) / (x_ngrid - 1)
#     dy = (y_max - y_min) / (y_ngrid - 1)

#     return(((x_min, y_min), (x_max, y_max)), (x_grid, y_grid), (x_vec, y_vec), (dx, dy))

## Metrics to compare

### RMSE

### KLD

In [None]:
# stolen from chippr
def calculate_kld(lpe, lqe, dx, from_log=False, vb=True):
    """
    Calculates the Kullback-Leibler Divergence between two N-dimensional PDFs
    evaluated on a shared, regular grid (irregular grids are not supported).

    Both inputs are renormalized on the grid before the divergence is
    computed, and the integral is approximated by a simple Riemann sum.

    Parameters
    ----------
    lpe: numpy.ndarray, float
        evaluations on the grid of the distribution `p` whose divergence from
        `q` will be calculated; log-probabilities if `from_log` is True,
        otherwise probabilities
    lqe: numpy.ndarray, float
        evaluations on the grid of the distribution `q`, in the same form and
        on the same grid as `lpe`
    dx: float or numpy.ndarray, float
        separation of grid values in each dimension
    from_log: boolean, optional
        if False, lpe, lqe are probability distributions, not log-probability
        distributions
    vb: boolean, optional
        currently unused; kept for interface compatibility

    Returns
    -------
    Dpq: float
        the value of the Kullback-Leibler Divergence from `q` to `p`
    """
    # Volume of one grid cell, i.e. the integration measure of the Riemann sum
    cell_volume = np.prod(dx)
    if from_log:
        pe = np.exp(lpe)
        qe = np.exp(lqe)
    else:
        pe = np.asarray(lpe, dtype=float)
        qe = np.asarray(lqe, dtype=float)
    # Normalize the evaluations so that each density integrates to one on the
    # grid (very approximately!) by simple summation
    pn = pe / (np.sum(pe) * cell_volume)
    qn = qe / (np.sum(qe) * cell_volume)
    # Compute the log of the normalized PDFs
    logp = safe_log(pn)
    logq = safe_log(qn)
    # Calculate the KLD from q to p. pn is a density, so the sum must be
    # weighted by the cell volume; without it the result is too large by a
    # factor of 1 / prod(dx).
    Dpq = np.sum(pn * (logp - logq)) * cell_volume
    return Dpq

### EMD

## Gaussian cases

In [None]:
# Mean and standard deviation of the reference Gaussian
base_loc, base_scale = 0., 1.
# Spacing of the evaluation grid
dx = 0.01

In [None]:
# Evaluation grid spanning ten reference widths either side of the mean
grid_lo = base_loc - 10. * base_scale
grid_hi = base_loc + 10. * base_scale
grid_ref = np.arange(grid_lo, grid_hi, dx)

# Reference Gaussian P and its density on the grid
P = sps.norm(loc=base_loc, scale=base_scale)
kde_ref = P.pdf(grid_ref)
plt.plot(grid_ref, kde_ref)

The comparison distributions are parameterized by:
- precision $r^{-1} \equiv \frac{\sigma_{0}}{\sigma}$ ($\approx \exp[-2 KLD]$)
- tension $t \equiv \frac{\Delta \mu}{\sqrt{\sigma_{0}^{2} + \sigma^{2}}}$ ($\approx \sqrt{KLD}$)

### variable precision

In [None]:
# Base-10 exponents of the comparison width relative to the reference width
widths = np.linspace(-2., 2., 5)
print(widths)
# NaN-filled so that metrics which are not computed yet are recognizable
# instead of holding uninitialized memory
RMS, KLD, EMD = (np.full_like(widths, np.nan) for _ in range(3))

for k, sigma in enumerate(widths):
    # sigma is the exponent, not the width itself; scaling base_scale by
    # 10**sigma reproduces the reference distribution at sigma == 0 for any
    # base_scale (the previous (10 * base_scale)**sigma did so only for 1)
    Q = sps.norm(loc=base_loc, scale=base_scale * 10.**sigma)
    kde_comp = Q.pdf(grid_ref)
    plt.plot(grid_ref, kde_comp)
    plt.plot(grid_ref, kde_ref)
    plt.show()
    # TODO: RMS[k] =
    KLD[k] = calculate_kld(kde_ref, kde_comp, dx)
    # TODO: EMD[k] =

### variable tension