In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib import cm
import arviz as az
import pymc3 as pm

# Assignment 1: Toxicity bioassay

## Problem and data

A **bioassay** is a biochemical test to estimate the potency of a sample compound. A typical bioassay involves a stimulus (e.g., a drug) applied to a subject (e.g., animals, tissues, plants). The corresponding response (e.g., death) of the subject is thereby triggered and measured (Wikipedia).

The following bioassay taken from Racine et al. (1986) is meant to evaluate the toxicity of a drug on animals. The effect of the drug is evaluated at $N=4$ dose levels. Each dose level $x_i$ is administered to a batch of $n_i$ animals. The number of deaths $y_i$ is the observed response.

| |Dose $x_i$ (log g/ml) | Number of animals $n_i$ | Number of deaths $y_i$ |
| ---|--- | --- | --- |
| 1|-0.86 | 5 | 0 |
| 2|-0.30 | 5 | 1 |
| 3|-0.05 | 5 | 3 |
| 4|0.73  | 5 | 5 |


For instance, the dose level $x_3=-0.05$ has been administered to $n_3=5$ animals. Out of the 5 animals, $y_3=3$ died. <br/>
Note that the dose $x_i$ is measured on a logarithmic scale. Thus, negative concentration levels are present.

We define for convenience (both as math symbols and as Python objects) the vectors $x$, $n$, and $y$ containing the quantities of the corresponding columns.

In [None]:
# Bioassay data, one entry per dose level (same order as the table above).
x = np.array([-0.86, -0.30, -0.05, 0.73])  # dose levels x_i (log g/ml)
n = np.array([5, 5, 5, 5], dtype=float)    # animals n_i treated at each dose
y = np.array([0, 1, 3, 5])                 # deaths y_i observed at each dose

## Modeling assumptions

For the probabilistic model, we make the following assumptions:

1. The outcomes of the $n_i$ animals within each group $i$ are *independent*. Each animal in the group has probability $p_i$ of death.

2. The probability of death $p_i$ depends on the dose $x_i$ as follows:
    $$p_i = \rm{sigm}(\alpha + \beta x_i),$$ 
    where 
    \begin{align*}
    \rm{sigm}(z) = \frac{1}{1 + e^{-z}}.
    \end{align*}
3. The prior probability of the parameters 
$\theta = \begin{bmatrix}
\alpha \\
\beta
\end{bmatrix}$
is Gaussian: 
\begin{align}
\alpha &\sim \mathcal{N}(\mu_\alpha, \sigma^2_\alpha), \qquad \mu_\alpha = 0, \sigma_\alpha=2\\
\beta &\sim \mathcal{N}(\mu_\beta, \sigma^2_\beta), \qquad \mu_\beta=10, \sigma_\beta=10.
\end{align}
4. The outcomes in the four groups are independent of each other, given $\theta$.

## 1.1: Probabilistic model

* Derive and comment the full probabilistic model.

Putting together the probabilistic assumptions 1-3, we obtain:

\begin{align*}
y_i | p_i &\sim  \mathrm{Binomial}(n_i, \rm{sigm}(\alpha + \beta x_i))\\
%p_i &= \rm{sigm}(\alpha + \beta x_i) \\
\alpha &\sim \mathcal{N}(0, 4)\\
\beta &\sim \mathcal{N}(10, 100).
\end{align*}

Furthermore, according to assumption 4:

$$P(y|\theta) = \prod_i P(y_i|\theta)$$

## 1.2: Maximum Likelihood estimation 


* Derive an analytical expression of the likelihood function $\mathcal{L}(\theta) = P(y|\theta)$.

The likelihood function $\mathcal{L}(\theta)$ is $P(y|\theta)$, seen as a function of $\theta$, with $y$ fixed to the observed outcome. <br/>Since the individual observations $y_i$ are independent, we have:

$$\mathcal{L}(\theta) = P(y|\theta) = \prod_{i=1}^N {{n_i}\choose{y_i}} \mathrm{sigm}(\alpha + \beta x_i)^{y_i} \cdot (1- \mathrm{sigm}(\alpha + \beta x_i))^{n_i - y_i}$$

* Write a Python function corresponding to the likelihood function $\mathcal{L}(\theta)$. Ignore multiplicative factors which do not depend on $\theta$.

In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    The naive formula overflows in ``exp(-z)`` for large negative ``z``
    (emitting a RuntimeWarning). Here the value is computed as
    ``exp(-log(1 + exp(-z)))``, with ``log(1 + exp(-z))`` obtained from
    ``np.logaddexp(0, -z)``, which stays finite for any finite ``z``.

    Parameters
    ----------
    z : scalar or array_like
        Argument(s) of the logistic function.

    Returns
    -------
    numpy scalar or ndarray
        ``sigm(z)``, element-wise, with values in [0, 1].
    """
    return np.exp(-np.logaddexp(0.0, np.negative(z)))

def lik(alpha, beta):
    # Exercise placeholder: the body does nothing, so the call returns None.
    # A later cell of this notebook redefines `lik` with a working version.
    pass
    # ... TODO

In [None]:
def lik(alpha, beta):
    """Likelihood of the observations, without the binomial coefficients.

    `alpha` and `beta` may be scalars or mutually broadcastable arrays
    (e.g. a meshgrid). A trailing axis is appended to both so that they
    broadcast against the per-dose arrays `x`, `y` and `n`; the per-dose
    terms are then multiplied together along that trailing axis.
    """
    a = np.atleast_1d(alpha)[..., np.newaxis]
    b = np.atleast_1d(beta)[..., np.newaxis]
    p_death = sigmoid(a + b * x)
    # One factor per dose level: p^y * (1 - p)^(n - y).
    per_dose = np.power(p_death, y) * np.power(1 - p_death, n - y)
    return per_dose.prod(axis=-1)

* Visualize the likelihood function in 2D and comment the obtained figure. 

   Hints:
    * you may use the `pcolormesh` function of `matplotlib`
    * appropriate ranges for $\alpha$ and $\beta$ are $[-4, 8]$ and $[-10, 40]$, respectively
    * an appropriate step size for both $\alpha$ and $\beta$ is 0.01

In [None]:
# Evaluate the likelihood on a regular (alpha, beta) grid.
dalpha = dbeta = 0.01  # grid step along each axis
ALPHA = np.arange(-4, 8, dalpha)
BETA = np.arange(-10, 40, dbeta)
AA, BB = np.meshgrid(ALPHA, BETA, indexing='xy')
AABB = np.stack((AA, BB), axis=-1)
LL = lik(AA, BB)

# Heat map of the likelihood over the grid.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(AA, BB, LL, cmap=cm.coolwarm, shading='auto')
fig.colorbar(c, ax=ax)
ax.set(title="Likelihood", xlabel=r"$\alpha$", ylabel=r"$\beta$");

Parameters $\alpha$ and $\beta$ are positively correlated, the maximum likelihood estimate is around (1, 8), the numerical scale is small (0 ... 0.0025), but not extremely critical.

* Derive an analytical expression of the log-likelihood function $\ell(\theta)$. 

In this case, the likelihood is numerically well-posed (not too many samples, not too many multiplications). The scale 0-0.0025 is not too bad! In general, it is better to work with logarithms. Let us compute the *log-likelihood* $\ell(\theta)$:

$$\ell(\theta) = \log \mathcal{L}(\theta) = \sum_i \log {{n_i}\choose{y_i}} + \sum_i \left[ y_i \log \mathrm{sigm}(\alpha + \beta x_i) +  (n_i - y_i) \log (1- \mathrm{sigm}(\alpha + \beta x_i)) \right].$$

The constant term $\sum_i \log {{n_i}\choose{y_i}}$ may be ignored.

* Write a Python function corresponding to the log-likelihood function $\ell(\theta)$ (possibly up to an additive factor).

In [None]:
def log_lik(alpha, beta):
    """Log-likelihood of the observations, up to an additive constant.

    The additive constant that is dropped is the sum of the log binomial
    coefficients, which does not depend on the parameters.

    Instead of taking ``np.log`` of a sigmoid that may have rounded to
    exactly 0 or 1 (giving ``-inf`` and ``0 * inf = nan`` terms), the two
    log-probabilities are evaluated directly in a stable form:

        log sigm(z)       = -log(1 + exp(-z)) = -logaddexp(0, -z)
        log(1 - sigm(z))  = -log(1 + exp(z))  = -logaddexp(0,  z)

    Both are finite for every finite ``z``, so no NaN patching is needed.

    Parameters
    ----------
    alpha, beta : scalar or array_like
        Finite parameter values; scalars or mutually broadcastable arrays
        (e.g. a meshgrid).

    Returns
    -------
    ndarray
        Log-likelihood with the shape of ``np.atleast_1d`` of the
        (broadcast) inputs.
    """
    alpha = np.atleast_1d(alpha)[..., np.newaxis] # useful to handle grid data
    beta  = np.atleast_1d(beta)[..., np.newaxis] # useful to handle grid data
    z = alpha + beta*x  # linear predictor, one entry per dose on the last axis
    log_p = -np.logaddexp(0.0, -z)    # log sigm(z)
    log_1mp = -np.logaddexp(0.0, z)   # log(1 - sigm(z))
    return np.sum(y*log_p + (n-y)*log_1mp, axis=-1)

* Visualize the log-likelihood function in 2D and comment the obtained figure. 

In [None]:
# Log-likelihood evaluated on the (alpha, beta) grid.
LOG_LL = log_lik(AA, BB)

# Heat map of the log-likelihood; the title now names the plotted quantity
# (it previously read "Likelihood", copied from the likelihood figure).
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(AA, BB, LOG_LL, cmap=cm.coolwarm, shading='auto')
fig.colorbar(c, ax=ax)
ax.set_title("Log-likelihood");
ax.set_xlabel(r"$\alpha$");
ax.set_ylabel(r"$\beta$");

* Compute the maximum likelihood estimate $\alpha^{\rm ml}, \beta^{\rm ml}$ of the parameters $\alpha, \beta$ through numerical optimizations. 

    Hints:
     * You may use the Python function `scipy.optimize.minimize`. 
     * You may look at the figures above to define a good starting point for optimization 
     * You may either maximize the likelihood or the log-likelihood (equivalently, minimize its negative). What is your choice?

In [None]:
from scipy.optimize import minimize


def log_lik_theta(theta):
    """Log-likelihood as a function of the vector theta = (alpha, beta)."""
    return log_lik(theta[0], theta[1])


def nll_theta(theta):
    """Negative log-likelihood: the objective handed to `minimize`."""
    return -log_lik_theta(theta)


# Maximum likelihood estimate; the search starts at (alpha, beta) = (1, 8).
res = minimize(nll_theta, x0=[1, 8])
theta_ml = res.x

* Visualize the likelihood function in 2D together with the ML estimate. Comment the obtained figure. 

In [None]:
# Likelihood heat map with the ML estimate marked by a black cross.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(AA, BB, LL, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_ml[0], theta_ml[1], "kx")
fig.colorbar(c, ax=ax)
ax.set(title="Likelihood", xlabel=r"$\alpha$", ylabel=r"$\beta$");

## 1.2: Maximum A Posteriori Estimation

* Derive an analytical expression of the posterior $f(\theta | y)$, up to a multiplicative factor not depending on $\theta$. 

Hint: exploit the already-obtained likelihood and the functional form of the Gaussian pdf.

$$f(\theta | y) = \frac{P(y | \theta) f(\theta)}{P(y)} \propto \mathcal{L}(\theta)
\exp\left(-\frac{1}{2} \frac{(\alpha - \mu_\alpha)^2}{\sigma^2_\alpha} \right ) 
\exp\left(-\frac{1}{2} \frac{(\beta - \mu_\beta)^2}{\sigma^2_\beta} \right ). $$ 

* Derive an analytical expression of the log-posterior $\log f(\theta | y)$, up to an additive factor not depending on $\theta$.

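$$\log f(\theta | y) = \log \frac{P(y | \theta) f(\theta)}{P(y)} = \log P(y | \theta) + \log f(\theta) - \log P(y) = \ell(\theta) - \frac{1}{2} (\theta - \mu)^{\top} \Sigma_0^{-1} (\theta - \mu) + \rm{cnst},$$
where $\mu = \begin{bmatrix}\mu_\alpha & \mu_\beta\end{bmatrix}^{\top}$ and $\Sigma_0 = \mathrm{diag}(\sigma^2_\alpha, \sigma^2_\beta)$ are the prior mean and covariance.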

* Write the unnormalized posterior and log-posterior (up to a multiplicative/additive factor, respectively) as Python functions.

In [None]:
# Hyper-parameters (mean, standard deviation) of the Gaussian priors.
mu_alpha, sigma_alpha = 0, 2
mu_beta, sigma_beta = 10, 10

# Frozen scipy distributions for the two priors.
prior_alpha = stats.norm(loc=mu_alpha, scale=sigma_alpha)
prior_beta = stats.norm(loc=mu_beta, scale=sigma_beta)

def post_unscaled(alpha, beta):
    """Unnormalized posterior: likelihood times the two Gaussian prior kernels.

    Only the exponential part of each Gaussian prior is used, so the result
    is proportional to the posterior as a function of `alpha` and `beta`.
    """
    kernel_alpha = np.exp(-0.5 * (alpha - mu_alpha)**2/sigma_alpha**2)
    kernel_beta = np.exp(-0.5 * (beta - mu_beta)**2/sigma_beta**2)
    return lik(alpha, beta) * kernel_alpha * kernel_beta

def log_post_unscaled(alpha, beta):
    """Log of the unnormalized posterior, up to an additive constant.

    Sum of the log-likelihood and the quadratic log-kernels of the two
    Gaussian priors.
    """
    penalty_alpha = 0.5 * (alpha - mu_alpha)**2/sigma_alpha**2
    penalty_beta = 0.5 * (beta - mu_beta)**2/sigma_beta**2
    return log_lik(alpha, beta) - penalty_alpha - penalty_beta

* Compute the maximum a posteriori estimate $\alpha^{\rm MAP}, \beta^{\rm MAP}$.

In [None]:
def minus_logpost(theta):
    """Negative unnormalized log-posterior at theta = (alpha, beta)."""
    return -log_post_unscaled(theta[0], theta[1])


# MAP estimate; the search starts at (alpha, beta) = (0, 10).
res = minimize(minus_logpost, x0=[0, 10])
theta_map = res.x

* Visualize the MAP estimate together with the unnormalized posterior in 2D. Comment the results.

In [None]:
# Unnormalized posterior evaluated on the (alpha, beta) grid.
POST_UNSC = post_unscaled(AA, BB)

# Heat map with the MAP estimate marked by a black cross.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(AA, BB, POST_UNSC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_map[0], theta_map[1], "kx")
fig.colorbar(c, ax=ax)
ax.set(title="Unnormalized posterior", xlabel=r"$\alpha$", ylabel=r"$\beta$");

## 1.3 Brute-force posterior estimation

* Compute a gridding approximation of the *normalized* posterior, with the correct normalization constant. Explain the passages.

We have:
    $$ \tilde f(\theta | y) = \mathcal{L}(\theta) \exp\left(-\frac{1}{2} 
(\theta - \mu)^{\top} \Sigma_0^{-1} (\theta - \mu) \right) = Z f(\theta | y),$$
where $Z$ is the to-be-determined normalization constant and it must be chosen such that:
$$\iint f(\theta | y) d\alpha\; d\beta = 1.$$
Thus,
$$Z = \iint \tilde f(\theta | y) d\alpha\; d\beta.$$

The integral above is intractable, but a gridding approximation may be used. Using an equi-spaced gridding, a Riemann Sum approximation is:

$$Z \approx \Delta \alpha \Delta \beta \sum_i \tilde f(\theta_i | y),$$

where $\Delta \alpha$ and $\Delta \beta$ are the discretization steps of the 2D grid and $\theta_i$ are the grid points.

In [None]:
# Grid steps (same values as used to build the grid).
dalpha, dbeta = 0.01, 0.01

# Riemann-sum approximation of the normalization constant, then rescale.
normalizing_factor = POST_UNSC.sum() * dalpha * dbeta
POST_SC = POST_UNSC / normalizing_factor

# Heat map of the normalized posterior with the MAP estimate marked.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(AA, BB, POST_SC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_map[0], theta_map[1], "kx")
fig.colorbar(c, ax=ax)
ax.set(title="Normalized posterior", xlabel=r"$\alpha$", ylabel=r"$\beta$");

* Using the grid-based approximation of the posterior, compute the posterior mean of $\alpha$ and $\beta$.

By definition, we have:

$$E[\theta | y] = \iint \theta f(\theta | y) d\alpha\; d\beta.$$

Using the grid-based approximation above:

$$E[\theta | y] \approx \Delta \alpha \Delta \beta \sum_i \theta_i f(\theta_i | y).$$

Software implementation below

In [None]:
# Riemann-sum approximation of the posterior means on the grid.
a_mean = (AA * POST_SC).sum() * dalpha * dbeta
b_mean = (BB * POST_SC).sum() * dalpha * dbeta
(a_mean, b_mean)

This is (yet another!) meaningful point estimate of $\theta$. 

## 1.4 Monte-carlo estimation

* Obtain a sample-based approximation of the posterior $f(\theta | y)$ by implementing the Metropolis algorithm from scratch.

In [None]:
def p_ratio_fun(alpha_propose, beta_propose, alpha_previous, beta_previous):
    """Posterior density ratio between a proposed and a previous point.

    The ratio is formed in log space, as the difference of the two
    unnormalized log-posteriors, and exponentiated only at the end.
    """
    log_ratio = (
        log_post_unscaled(alpha_propose, beta_propose)
        - log_post_unscaled(alpha_previous, beta_previous)
    )
    return np.exp(log_ratio)

In [None]:
# Density ratio for a move from (0.374, 20.04) to (1.89, 24.76).
p_ratio_fun(alpha_propose=1.89, beta_propose=24.76,
            alpha_previous=0.374, beta_previous=20.04)

In [None]:
# Density ratio for the reverse move, from (1.89, 24.76) to (0.374, 20.04).
p_ratio_fun(alpha_propose=0.374, beta_propose=20.04,
            alpha_previous=1.89, beta_previous=24.76)

Let us run a Metropolis algorithm to sample from the posterior. The `p_ratio_fun` function is all we need!

In [None]:
# Random-walk Metropolis sampler for the posterior of (alpha, beta).
N = 100_000 # number of Metropolis steps
alpha_0 = mu_alpha # initial value for alpha
beta_0 = mu_beta # initial value for beta

sigma_prop_alpha = 1.0 # std of the Gaussian proposal step for alpha
sigma_prop_beta = 5.0 # std of the Gaussian proposal step for beta

# Dedicated, seeded generator: re-running the notebook reproduces the same chain.
rng = np.random.default_rng(42)

alpha_step = alpha_0
beta_step = beta_0

# Preallocated storage for the chain (one entry per step).
alphas = np.empty(N)
betas = np.empty(N)
for idx in range(N):
    # Record the current state before attempting a move.
    alphas[idx] = alpha_step
    betas[idx] = beta_step

    # Gaussian proposal centred on the current state.
    alpha_prop = alpha_step + sigma_prop_alpha * rng.standard_normal()
    beta_prop = beta_step + sigma_prop_beta * rng.standard_normal()

    # Accept with probability min(1, posterior ratio).
    p_ratio = p_ratio_fun(alpha_prop, beta_prop, alpha_step, beta_step)
    accept_prob = np.minimum(1.0, p_ratio)
    accept = (rng.random() < accept_prob)

    if accept:
        alpha_step = alpha_prop
        beta_step = beta_prop

# Samples as an (N, 2) array with columns (alpha, beta).
thetas = np.c_[alphas, betas]

* Compare the Metropolis samples with the gridding-based approximation of the posterior distribution $f(\theta | y)$ and comment the result.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 4))

# Left panel: 2D histogram of the Metropolis samples, with contours of the
# grid-based normalized posterior drawn on top.
ax[0].hist2d(x=alphas, y=betas, bins=100, cmap=plt.cm.BuPu)
ax[0].set(xlim=[-4, 10], ylim=[-10, 40]);
ax[0].contour(AA, BB, POST_SC);

# Right panel: the grid-based normalized posterior itself.
c = ax[1].pcolormesh(AA, BB, POST_SC, cmap=cm.coolwarm, shading='auto')

In [None]:
# Sample mean of (alpha, beta), skipping the first 10_000 Metropolis steps.
thetas[10_000:].mean(axis=0)

In [None]:
# Sample covariance of (alpha, beta), computed over all Metropolis steps.
np.cov(thetas.T)

In [None]:
# Grid-based posterior means, recomputed for comparison with the sample mean.
(AA * POST_SC).sum() * dalpha * dbeta, (BB * POST_SC).sum() * dalpha * dbeta

In [None]:
# Trace of the alpha samples along the Metropolis chain.
fig, ax = plt.subplots()
ax.plot(thetas[:, 0])
ax.set(title=r"Metropolis trace of $\alpha$", xlabel="step", ylabel=r"$\alpha$");

In [None]:
# Trace of the beta samples along the Metropolis chain.
fig, ax = plt.subplots()
ax.plot(thetas[:, 1])
ax.set(title=r"Metropolis trace of $\beta$", xlabel="step", ylabel=r"$\beta$");

* Obtain a sample-based approximation of the posterior $f(\theta | y)$ using pymc3.

In [None]:
with pm.Model() as bioassay:
    # Gaussian priors with the hyper-parameters defined earlier in the notebook.
    alpha = pm.Normal("alpha", mu=mu_alpha, sigma=sigma_alpha)
    beta = pm.Normal("beta", mu=mu_beta, sigma=sigma_beta)
    # Death probability at each dose level; stored in the trace as "p".
    p = pm.Deterministic("p", pm.math.sigmoid(alpha + beta*x))
    # Binomial likelihood of the observed death counts.
    y_var = pm.Binomial("y_var", n=n, p=p, observed=y)
    # Fixed seed so that re-running the notebook reproduces the same draws.
    trace=pm.sample(10_000, return_inferencedata=True, random_seed=42)

* Comment the results obtained with pymc3 and compare with previous results (gridding and Metropolis from scratch)

In [None]:
# Display the ArviZ summary table for the pymc3 trace.
with bioassay:
    display(az.summary(trace))

In [None]:
# ArviZ trace plot for the pymc3 samples (trailing ';' hides the repr).
az.plot_trace(trace);

In [None]:
# ArviZ posterior plot restricted to alpha and beta (the "p" variable is left out).
az.plot_posterior(trace, var_names=["alpha", "beta"]);