In [None]:
import os
import numpy as np
import scipy
import scipy.stats as stats
import matplotlib.pyplot as plt
import pymc3 as pm
import arviz as az
import seaborn as sns

In [None]:
# Apply seaborn's default plot styling to all figures drawn below.
sns.set()

# Exercise session 6: WAIC

## WAIC Definition

Let us denote by $\Theta$ the samples $\theta_0, \theta_1, \dots, \theta_{S-1}$ in the trace and by $y$ the measurement(s). 

* The log-pointwise-predictive-density $\rm lppd$ is defined as:

$$ {\rm lppd}(y, \Theta) = \sum_i \log \frac{1}{S} \sum_s p(y_i | \theta_s),$$

* The effective number of parameters $p_{{\rm waic}}$ is defined as:

$$ p_{{\rm waic}} = \sum_i {\rm var}_{\theta} \log p(y_i|\theta)$$

* The WAIC criterion (the lower, the better) is:

$${\rm WAIC}(y, \Theta) = -2({\rm lppd} - p_{\rm waic}).$$

* The expected log pointwise predictive density $\rm{elpd}$ (the higher, the better) is defined as:

$${\rm elpd} = {\rm lppd} - p_{\rm waic}.$$

## WAIC for the beta-binomial

Let us implement and test the WAIC on the Beta-Binomial model: the *Hello World* of probabilistic programming.

Let us consider a coin that falls heads with probability $\theta=0.25$ (assumed unknown).

Our prior knowledge of $\theta$ is encoded in a Beta distribution with parameters $a=20$ and $b=10$:

$$ p_{\rm prior}(\theta) = \frac{1}{B(a,b)} \theta^{a-1} (1-\theta)^{b-1}, \qquad \theta \in [0, 1],$$

where $B(a,b)$ is a proper normalization constant such that $\int_{0}^1 p_{\rm prior}(\theta) \; d \theta = 1.$ 

We toss the coin $n$ times and measure a number $y$ of heads. How does our belief of $\theta$ change with the measurement?

In [None]:
# Constants of the coin-toss example: Beta(a, b) prior and Binomial(n, theta) likelihood.
a = 20 # prior: parameter a
b = 10 # prior: parameter b
n = 50 # likelihood: number of tosses
y = 15 # likelihood: number of HEADs observed. (Fixed in this example. It could be sampled from a binomial instead)

In [None]:
# Beta-Binomial model: Beta(a, b) prior on theta, Binomial(n, theta) likelihood for the observed head count y.
with pm.Model() as beta_binomial:
    theta = pm.Beta("theta", alpha=a, beta=b)
    y_obs = pm.Binomial("y_obs", n=n, p=theta, observed=y)
    # Fixed random seed so the draws are reproducible; the result is requested as an InferenceData object.
    trace_bb = pm.sample(1000, random_seed=123, return_inferencedata=True)


In [None]:
# Show the ArviZ summary table for the beta-binomial trace.
with beta_binomial:
    display(az.summary(trace_bb))

In [None]:
# Trace plot of the beta-binomial samples; the trailing semicolon hides the cell's text output.
az.plot_trace(trace_bb);

In [None]:
# Posterior plot for the beta-binomial trace, drawn in blue.
az.plot_posterior(trace_bb, color="b");

## WAIC computation details

Let us consider a scalar observation $y$ (as in the beta-binomial example). In the definitions above, the outer summations on $i$ disappear. 

Let us denote by ``lik_vec`` and ``loglik_vec`` vectors containing, respectively, the likelihood and log-likelihood of $y$, for the samples of $\theta_s \in \Theta$:

$$\text{loglik_vec} = [\log p(y | \theta_0),\;\; \log p(y | \theta_1),\;\; \dots,\;\; \log p(y | \theta_{S-1})]^\top$$
$$\text{lik_vec} = [p(y | \theta_0),\;\; p(y | \theta_1),\;\; \dots,\;\; p(y | \theta_{S-1})]^\top$$

* $p_{{\rm waic}}$ can be computed as the sample variance of ``loglik_vec``. In Python:

``p_waic = np.var(loglik_vec)``

* ${\rm lppd}$ could be computed (in principle) as the logarithm of the average of ``lik_vec``. In Python:

``lppd = np.log(np.mean(lik_vec))``

* For better numerical precision, ${\rm lppd}$ computation is typically worked out in the log-domain, exploiting the logsumexp trick:

``lppd = scipy.special.logsumexp(loglik_vec) - np.log(S)``

This exploits the identity:

$$\log {{\sum_s p(y_i | \theta_s)}} =  \log\left(\sum_s e^{\log p(y_i | \theta_s)}\right),$$
where on the right-hand side we recognize the ``logsumexp`` operation applied to the vector ``loglik_vec``.

Thus, in practice, all we need to compute the WAIC is the log-likelihood of the samples.

## WAIC manual computation

* Write the likelihood and the log-likelihood function of the observations in closed form. Note: for these calculations, we cannot disregard multiplicative/additive terms not depending on $\theta$.

The likelihood is:
$$p(y | \theta) = {{n}\choose{y}} \theta^{y} \cdot (1-\theta)^{n-y}.$$

The log-likelihood is:
$$\log p(y | \theta) = \log {{n}\choose{y}} + y \log(\theta) + (n-y)\log(1-\theta).$$

* Obtain the likelihood and the log-likelihood as python functions (including multiplicative/additive factors)

In [None]:
def lik(theta, n_tosses=None, n_heads=None):
    """Binomial likelihood p(y | theta), including the binomial coefficient.

    Parameters
    ----------
    theta : float or ndarray
        Probability of heads; may be a vector of posterior samples.
    n_tosses : int, optional
        Number of tosses. Defaults to the notebook-level ``n``.
    n_heads : int, optional
        Number of heads observed. Defaults to the notebook-level ``y``.

    Returns
    -------
    float or ndarray
        The likelihood evaluated at each value of ``theta``.
    """
    # The notebook globals are only a fallback: `y` is rebound to a data array
    # later in the notebook, so explicit arguments keep the function usable there.
    if n_tosses is None:
        n_tosses = n
    if n_heads is None:
        n_heads = y
    return scipy.special.binom(n_tosses, n_heads) * (theta ** n_heads) * (1 - theta)**(n_tosses - n_heads)

In [None]:
def log_lik(theta, n_tosses=None, n_heads=None):
    """Binomial log-likelihood log p(y | theta), including the binomial coefficient.

    Implements ``log C(n, y) + y*log(theta) + (n - y)*log(1 - theta)``.

    Parameters
    ----------
    theta : float or ndarray
        Probability of heads; may be a vector of posterior samples.
    n_tosses : int, optional
        Number of tosses. Defaults to the notebook-level ``n``.
    n_heads : int, optional
        Number of heads observed. Defaults to the notebook-level ``y``.

    Returns
    -------
    float or ndarray
        The log-likelihood evaluated at each value of ``theta``.
    """
    # The notebook globals are only a fallback: `y` is rebound to a data array
    # later in the notebook, so explicit arguments keep the function usable there.
    if n_tosses is None:
        n_tosses = n
    if n_heads is None:
        n_heads = y
    # log C(n, y) via log-gamma: taking the log of the coefficient itself
    # overflows to inf once the coefficient no longer fits in a float.
    log_binom_coeff = (
        scipy.special.gammaln(n_tosses + 1)
        - scipy.special.gammaln(n_heads + 1)
        - scipy.special.gammaln(n_tosses - n_heads + 1)
    )
    # xlogy / xlog1py return 0 for 0*log(0), so theta in {0, 1} does not give nan.
    return (
        log_binom_coeff
        + scipy.special.xlogy(n_heads, theta)
        + scipy.special.xlog1py(n_tosses - n_heads, -theta)
    )

* Extract the samples $\Theta$ from the trace.

In [None]:
# Flatten all posterior samples of theta into a single 1-D vector.
theta_trace = trace_bb.posterior["theta"].values.ravel()
S = theta_trace.size  # total number of posterior samples

* Compute $\rm{lppd}$ and $p_{\rm waic}$

In [None]:
# Log-likelihood of the observation evaluated at every posterior sample of theta.
log_lik_vec = log_lik(theta_trace)
# Alternative (not used here): read the log-likelihood stored in the trace instead.
# log_lik_pm = np.array(trace_bb.log_likelihood.y_obs).ravel()

In [None]:
# Direct alternative (kept for reference): log of the average likelihood.
# lppd = np.log(np.mean(lik(theta_trace)))
# Log-domain version: logsumexp gives log(sum_s p(y | theta_s)); subtracting log(S) turns the sum into a mean.
lppd = scipy.special.logsumexp(log_lik_vec) - np.log(S)

In [None]:
# Effective number of parameters: variance of the log-likelihood over the posterior samples
# (np.var with its default arguments).
p_waic = np.var(log_lik_vec)
p_waic

* Compute $\rm{waic}$ and ${\rm elpd}$

In [None]:
# WAIC on the deviance scale (the lower, the better).
waic = -2 * (lppd - p_waic)
waic

In [None]:
# Expected log pointwise predictive density (the higher, the better).
elpd = lppd - p_waic
elpd

## WAIC Computation with arviz

arviz computes these quantities automatically:

In [None]:
# Let ArviZ compute the WAIC of the beta-binomial trace, to compare with the manual values above.
with beta_binomial:
    display(az.waic(trace_bb))

## WAIC for the multiparameter model

In [None]:
# Note: from here on `y` is the array loaded from chemical_shifts.txt,
# no longer the head count used in the coin example.
chemical_shifts_path = os.path.join("..", "data", "chemical_shifts.txt")
y = np.loadtxt(chemical_shifts_path)
y

In [None]:
# KDE of the data with a rug of the individual observations.
# `rug` is a boolean flag; the string "True" only worked because any non-empty string is truthy.
az.plot_kde(y, rug=True);

In [None]:
# Gaussian model of the data: Uniform(40, 70) prior on the mean, HalfNormal(10) prior on the std.
with pm.Model() as model_gaussian:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sd=10)
    y_obs = pm.Normal('y_obs', mu=mu, sd=sigma, observed=y)
    # Fixed seed (same as for the beta-binomial model) so that a fresh run reproduces the same trace.
    trace_gaussian = pm.sample(1000, random_seed=123, return_inferencedata=True)

In [None]:
# Trace plot of the Gaussian model samples; the trailing semicolon hides the cell's text output.
az.plot_trace(trace_gaussian);

In [None]:
# Show the ArviZ summary table for the Gaussian model trace.
with model_gaussian:
    display(az.summary(trace_gaussian))

In [None]:
with model_gaussian:
    # Select 500 samples <mu_s, sigma_s> from the trace and,
    # for each sample, compute a draw from N(mu_s, sigma_s).
    # The ppc variable is a dictionary: the keys are the names of the observed variables in our model
    # and the values are arrays of shape (samples, size).
    # The dictionary allows dealing with models with more than one observed variable.
    ppc = pm.sample_posterior_predictive(trace_gaussian, samples=500)

In [None]:
# Posterior predictive check: wrap the predictive draws in an InferenceData object and plot them.
az.plot_ppc(az.from_pymc3(posterior_predictive=ppc, model=model_gaussian));

The posterior predictive plot is not very convincing (outliers are not modeled)

In [None]:
# WAIC of the Gaussian model, evaluated inside its own model context
# (the original cell opened the unrelated beta-binomial model here).
with model_gaussian:
    display(az.waic(trace_gaussian))

Let us compute the WAIC manually:

$$\mathcal{L}(\theta) = p(y | \theta) = \frac{1}{\sigma \sqrt{2 \pi}} e^{-\frac{(y-\mu)^2}{2\sigma^2}}$$

$$\ell(\theta) = \log p(y|\theta) = -\log(\sigma) -\frac{1}{2} \log ({2 \pi}) - \frac{(y-\mu)^2}{2\sigma^2}$$

In [None]:
def log_lik_fun(mu, sigma, y):
    """Gaussian log-density log N(y | mu, sigma), evaluated elementwise.

    The three arguments are combined with NumPy broadcasting, so arrays of
    compatible shapes give an array of log-densities.
    """
    log_norm_const = -np.log(sigma) - 1/2*np.log(2*np.pi)
    squared_dev = (y - mu)**2
    return log_norm_const - squared_dev/(2*sigma**2)

In [None]:
# Posterior samples as column vectors and the data as a row vector,
# so that they broadcast to a (n_samples, n_data) grid.
mu_mc = trace_gaussian.posterior["mu"].values.reshape(-1, 1)  # (n_samples, 1)
sigma_mc = trace_gaussian.posterior["sigma"].values.reshape(-1, 1)  # (n_samples, 1)
y_ = y[None, :]  # (1, n_data)

In [None]:
# Display the three shapes to check that they broadcast as intended.
mu_mc.shape, sigma_mc.shape, y_.shape

In [None]:
# Manually recomputed log-likelihood: one row per posterior sample, one column per data point.
log_lik_recalc = log_lik_fun(mu_mc, sigma_mc, y_)

In [None]:
# Log-likelihood stored in the trace by the sampler.
log_lik_vec = np.array(trace_gaussian.log_likelihood.y_obs)
log_lik_vec = log_lik_vec.reshape(-1, log_lik_vec.shape[-1]) # (mc sample, data sample)
# Sanity check: the manual recomputation must reproduce the stored values,
# otherwise the closed-form log-likelihood above is wrong.
np.testing.assert_allclose(log_lik_recalc, log_lik_vec, rtol=1e-6, atol=1e-8)

In [None]:
# Expected layout: (mc samples, data points).
log_lik_vec.shape

In [None]:
# Variance of the log-likelihood over the MC samples (axis 0) for each data point, summed over the data.
# NOTE(review): ddof=1 is used here, while the beta-binomial section calls np.var with its
# default ddof — confirm which convention is intended.
p_waic = np.var(log_lik_vec, axis=0, ddof=1).sum()
p_waic

In [None]:
# Take the number of MC samples from this trace's log-likelihood matrix; the original reused `S`,
# which was computed from the beta-binomial trace and only matches if both runs drew equally many samples.
n_mc_samples = log_lik_vec.shape[0]
# Per data point: log of the mean likelihood over the samples, computed in the log domain.
lppd_pointwise = scipy.special.logsumexp(log_lik_vec, axis=0) - np.log(n_mc_samples)
lppd = np.sum(lppd_pointwise)
lppd

In [None]:
# elpd = lppd - p_waic for the Gaussian model, to compare with the ArviZ output.
lppd - p_waic

In [None]:
# Student-t model of the same data: heavier tails than the Gaussian model.
with pm.Model() as model_student:
    μ = pm.Uniform('μ', 40, 75)
    σ = pm.HalfNormal('σ', sd=10)
    ν = pm.Exponential('ν', 1/30)
    # Bind the observed node to `y_obs`, not `y`: rebinding `y` would replace the data array
    # with a PyMC variable, so re-running this cell would no longer see the data.
    y_obs = pm.StudentT('y', mu=μ, sd=σ, nu=ν, observed=y)
    # Fixed seed so that a fresh run reproduces the same trace.
    trace_student = pm.sample(1000, random_seed=123, return_inferencedata=True)

In [None]:
# Trace plot of the Student-t model samples; the trailing semicolon hides the cell's text output.
az.plot_trace(trace_student);

In [None]:
# Show the ArviZ summary table for the Student-t model trace.
with model_student:
    display(az.summary(trace_student))

In [None]:
with model_student:
    # Select 500 samples <μ_s, σ_s, ν_s> from the trace and,
    # for each sample, compute a draw from the Student-t likelihood with those parameters.
    # The ppc variable is a dictionary: the keys are the names of the observed variables in our model
    # and the values are arrays of shape (samples, size).
    # The dictionary allows dealing with models with more than one observed variable.
    ppc = pm.sample_posterior_predictive(trace_student, samples=500)

In [None]:
# Posterior predictive check for the Student-t model.
az.plot_ppc(az.from_pymc3(posterior_predictive=ppc, model=model_student));

In [None]:
# Rank the two models by WAIC. The "BB-pseudo-BMA" weights rely on a random
# bootstrap, so a fixed seed keeps the comparison table reproducible.
comp_df = az.compare(
    {"model_gaussian": trace_gaussian, "model_student": trace_student},
    ic="waic",
    method="BB-pseudo-BMA",
    seed=123,
)
comp_df

In [None]:
# Visual summary of the model comparison table.
az.plot_compare(comp_df);