In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib import cm
import arviz as az
import pymc as pm

# Assignment 1: Baby length regression

## Problem and data

The length of babies follows a square root formula...

In [None]:
# Seed NumPy's global random number generator so that the np.random draws made
# later in this notebook (e.g. the Metropolis proposals) are reproducible.
np.random.seed(43)

In [None]:
# Load the observations and pull the two columns used throughout the notebook
# out as plain NumPy arrays.
babies = pd.read_csv('babies.csv')
month_obs = babies["Month"].values
length_obs = babies["Length"].values

# Preview a few random rows. A notebook only renders the *last* expression of
# a cell, so the preview has to come after the assignments to be displayed.
babies.sample(5)

In [None]:
# Raw data: one marker per row of `babies`, Length against Month.
fig, ax = plt.subplots()
ax.plot(babies["Month"], babies["Length"], 'C0.', alpha=1.0)
ax.set(xlabel="Month", ylabel="Length");

## Modeling assumptions

For the probabilistic model, we make the following assumptions:

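1. The average of the baby length is $\mu_i = \alpha + \beta \sqrt{M_i}$, where $M_i$ is the age in months of observation $i$ and the intercept $\alpha$ is fixed to a known value.

2. The standard deviation: the observed lengths $y_i$ are normally distributed around $\mu_i$ with a constant, unknown standard deviation $\sigma$.
3. The prior probability of the parameters: $\beta \sim \mathcal{N}(0, \sigma_\beta)$ and $\sigma \sim |\mathcal{N}|(\sigma_\sigma)$ (half-normal), independent of each other.
4. Probabilities are independent: given the parameters, the observations $y_i$ are independent of each other.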


In [None]:
# Fixed constants of the probabilistic model.
alpha = 49.5  # intercept of the mean length; read as a global by log_lik, not estimated
sigma_sigma = 10  # scale of the half-normal prior on sigma
sigma_beta = 10  # standard deviation of the zero-mean normal prior on beta

## 1.1: Probabilistic model

* Derive and comment the full probabilistic model.

Putting together the probabilistic assumptions 1-3, we obtain:

\begin{align*}
y_i &\sim \mathcal{N}(\alpha + \beta \sqrt{M_i}, \sigma)\\
\beta &\sim \mathcal{N}(0, \sigma_\beta)\\
\sigma &\sim |\mathcal{N}|(\sigma_\sigma)\\
\end{align*}

Furthermore, according to assumption 4:

$$f(y|\theta) = \prod_i f(y_i|\theta)$$

## 1.2: Maximum Likelihood estimation 


* Derive an analytical expression of the likelihood function $\mathcal{L}(\theta) = f(y|\theta)$.

The likelihood function $\mathcal{L}(\theta)$ is $f(y|\theta)$, seen as a function of $\theta = (\beta, \sigma)$, with $y$ fixed to the observed outcome. <br/>Since the individual observations $y_i$ are independent, and writing $x_i = \sqrt{M_i}$ for brevity, we have:

$$\mathcal{L}(\theta) = f(y|\theta) = \prod_{i=1}^N \frac{1}{\sigma \sqrt{2\pi}} 
e^{-\frac{1}{2} \bigg( \frac{y_i - (\alpha + \beta x_i)}{\sigma} \bigg )^2 }  $$

* Derive an analytical expression of the log-likelihood function $\ell(\theta) = \log P(y|\theta)$.

\begin{align}\ell(\theta) &= \sum_{i=1}^N \left[ -\log(\sigma) -\frac{1}{2} \log(2\pi) -\frac{1}{2}\bigg( \frac{y_i - (\alpha + \beta x_i)}{\sigma} \bigg )^2 \right]\\
&= -\frac{N}{2} \log (2\pi) -N\log \sigma - \frac{1}{2 \sigma^2}\sum_{i=1}^{N} \big{(} y_i - (\alpha + \beta x_i)  \big{)} ^2
\end{align}

* Write a Python function corresponding to the log-likelihood function $\ell(\theta)$. You may ignore additive factors which do not depend on $\beta, \sigma$.

In [None]:
def log_lik(beta, sigma, month=None, length=None, intercept=None):
    """Gaussian log-likelihood of the square-root growth model.

    The model is ``length_i ~ Normal(intercept + beta * sqrt(month_i), sigma)``
    with independent observations, so the log-likelihood is

        -N/2 * log(2*pi) - N * log(sigma) - sum(residual_i**2) / (2 * sigma**2)

    Parameters
    ----------
    beta : float
        Slope multiplying ``sqrt(month)``.
    sigma : float
        Standard deviation of the observation noise. Values that are not
        strictly positive lie outside the support of the model.
    month, length : array-like, optional
        Observed ages and lengths. When omitted, the ``"Month"`` and
        ``"Length"`` columns of the notebook-level ``babies`` frame are used.
    intercept : float, optional
        Intercept of the mean. When omitted, the notebook-level ``alpha`` is
        used.

    Returns
    -------
    float
        The log-likelihood, or ``-inf`` when ``sigma`` is not strictly
        positive (the likelihood is zero there; without this guard
        ``np.log(sigma)`` would emit a warning and return nan).
    """
    # `not sigma > 0` is also true for nan, which `sigma <= 0` would miss.
    if not sigma > 0:
        return -np.inf

    # Fall back to the notebook globals only when the caller gave no data.
    if month is None:
        month = babies["Month"].values
    if length is None:
        length = babies["Length"].values
    if intercept is None:
        intercept = alpha

    x = np.sqrt(np.asarray(month, dtype=float))
    y = np.asarray(length, dtype=float)
    y_mu = intercept + beta*x
    N = x.shape[0]
    return -N/2*np.log(2*np.pi) - N*np.log(sigma) -1/(2*sigma**2) * np.sum((y - y_mu)**2)

In [None]:
# Grid spacing along each parameter axis.
dbeta = 0.002
dsigma = 0.002

# Evaluation grid. With 'xy' indexing beta varies along columns and sigma
# along rows, so BB and SS both have shape (len(SIGMA), len(BETA)).
BETA = np.arange(7.1, 7.5, dbeta)
SIGMA = np.arange(2.6, 3.5, dsigma)
BB, SS = np.meshgrid(BETA, SIGMA, indexing='xy')
BBSS = np.stack((BB, SS), axis=-1)

# Log-likelihood at every grid node: row i <-> SIGMA[i], column j <-> BETA[j].
LL = np.empty(BB.shape)
for i, sigma_i in enumerate(SIGMA):
    for j, beta_j in enumerate(BETA):
        LL[i, j] = log_lik(beta_j, sigma_i)

# Colour map of the log-likelihood surface over the (beta, sigma) grid.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, LL, cmap=cm.coolwarm, shading='auto')
fig.colorbar(c, ax=ax)
ax.set_title("Log-likelihood")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

In [None]:
from scipy.optimize import minimize


def log_lik_theta(theta):
    """Log-likelihood as a function of the stacked vector theta = (beta, sigma)."""
    beta, sigma = theta[0], theta[1]
    return log_lik(beta, sigma)


def nll_theta(theta):
    """Negative log-likelihood; minimizing it maximizes the likelihood."""
    return -log_lik_theta(theta)


# Numerical maximum-likelihood estimate, started from (beta, sigma) = (7, 2).
res = minimize(nll_theta, x0=[7, 2])
theta_ml = res.x

In [None]:
# Maximum-likelihood estimate returned by the optimizer, ordered as (beta, sigma).
theta_ml

* Plot the log-likelihood together with the maximum likelihood estimate

In [None]:
# Log-likelihood map on the (beta, sigma) grid with the ML estimate overlaid
# as a black cross.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, LL, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_ml[0], theta_ml[1], "kx")
fig.colorbar(c, ax=ax)
ax.set(title="Scaled log-likelihood", xlabel=r"$\beta$", ylabel=r"$\sigma$");

* Plot the likelihood function up to a normalization constant

In [None]:
# Likelihood up to a constant factor. Subtracting the grid maximum before
# exponentiating keeps the values in (0, 1], with the peak equal to 1.
LIK_SC = np.exp(LL - LL.max())

# Scaled likelihood map with the ML estimate overlaid as a black cross.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, LIK_SC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_ml[0], theta_ml[1], "kx")
fig.colorbar(c, ax=ax)
ax.set(title="Scaled likelihood", xlabel=r"$\beta$", ylabel=r"$\sigma$");

## 1.2: Maximum A Posteriori Estimation

* Derive an analytical expression of the posterior $f(\theta | y)$, up to a multiplicative factor not depending on $\theta$. 

Hint: exploit the already-obtained likelihood and the functional form of the Gaussian pdf.

$$f(\theta | y) = \frac{P(y | \theta) f(\theta)}{P(y)} \propto \mathcal{L}(\theta)
\exp\left(-\frac{1}{2} \frac{\beta^2}{\sigma^2_\beta} \right )
\exp\left(-\frac{1}{2} \frac{\sigma^2}{\sigma^2_\sigma} \right ) $$

* Derive an analytical expression of the log-posterior $\log f(\theta | y)$, up to an additive factor not depending on $\theta$.

$$\log f(\theta | y) = \log \frac{P(y | \theta) f(\theta)}{P(y)} = \log P(y | \theta) + \log f(\beta) + \log f(\sigma) - \log P(y) $$

Thus, 

$$\log f(\theta | y) = \ell(\theta) -\frac{1}{2} \frac{\beta^2}{\sigma_\beta^2} -\frac{1}{2} \frac{\sigma^2}{\sigma_\sigma^2} $$

* Write the unnormalized posterior and log-posterior (up to a multiplicative/additive factor, respectively) as Python functions.

In [None]:
def log_post_unscaled(beta, sigma, sigma_beta=10, sigma_sigma=10):
    """Log-posterior of (beta, sigma) up to an additive constant.

    Adds the log-priors (without their normalizing constants) to the
    log-likelihood:

        log_lik(beta, sigma) - beta**2 / (2*sigma_beta**2)
                             - sigma**2 / (2*sigma_sigma**2)

    Parameters
    ----------
    beta : float
        Slope parameter.
    sigma : float
        Noise standard deviation.
    sigma_beta : float, optional
        Standard deviation of the zero-mean normal prior on ``beta``.
    sigma_sigma : float, optional
        Scale of the half-normal prior on ``sigma``.

    Returns
    -------
    float
        The unnormalized log-posterior, or ``-inf`` when ``sigma`` is not
        strictly positive: the half-normal prior has no mass there, so such a
        point must never be accepted by a sampler or preferred by an optimizer.
    """
    # `not sigma > 0` also rejects nan, which `sigma <= 0` would let through.
    if not sigma > 0:
        return -np.inf
    log_lik_val = log_lik(beta, sigma)
    return log_lik_val -0.5*beta**2/sigma_beta**2 -0.5* sigma**2/sigma_sigma**2

* Compute the maximum a posteriori estimate $\beta^{\rm MAP}, \sigma^{\rm MAP}$.

In [None]:
def minus_logpost(theta):
    """Negative unnormalized log-posterior of theta = (beta, sigma)."""
    beta, sigma = theta[0], theta[1]
    return -log_post_unscaled(beta, sigma)


# Numerical MAP estimate, started from (beta, sigma) = (7, 2).
res = minimize(minus_logpost, x0=[7, 2])
theta_map = res.x

In [None]:
# Maximum a posteriori estimate returned by the optimizer, ordered as (beta, sigma).
theta_map

* Visualize the MAP estimate together with the unnormalized posterior in 2D. Comment the results.

In [None]:
# Unnormalized log-posterior at every grid node: row i <-> SIGMA[i],
# column j <-> BETA[j].
LP_UNSC = np.empty((SIGMA.shape[0], BETA.shape[0]))
for i in range(SIGMA.shape[0]):
    for j in range(BETA.shape[0]):
        LP_UNSC[i, j] = log_post_unscaled(BETA[j], SIGMA[i])

# Map of the log-posterior with both point estimates overlaid:
# black cross = ML estimate, black circle = MAP estimate.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BETA, SIGMA, LP_UNSC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_ml[0], theta_ml[1], "kx")
ax.plot(theta_map[0], theta_map[1], "ko")
fig.colorbar(c, ax=ax)
# The plotted array holds log-posterior values, so the title says so.
ax.set_title("Unnormalized log-posterior")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

In [None]:
# `POST` was referenced here without ever being defined, which raises a
# NameError on a fresh kernel. Build it from the gridded log-posterior:
# shifting by the maximum before exponentiating avoids underflow, and dividing
# by the total turns the values into probability masses per grid cell.
POST = np.exp(LP_UNSC - np.max(LP_UNSC))
POST /= POST.sum()

# Sanity check: the masses over the whole grid must add up to 1.
np.sum(POST)

## 1.3 Brute-force posterior estimation

* Compute a gridding approximation of the *normalized* posterior, with the correct normalization constant. Explain the passages.

We have:
    $$ \tilde f(\theta | y) = \mathcal{L}(\theta) \exp\left(-\frac{1}{2} \frac{\beta^2}{\sigma^2_\beta} \right) \exp\left(-\frac{1}{2} \frac{\sigma^2}{\sigma^2_\sigma} \right) = Z f(\theta | y),$$
where $\tilde f$ is the unnormalized posterior, $Z$ is the to-be-determined normalization constant and it must be chosen such that:
$$\iint f(\theta | y)\, d\beta\; d\sigma = 1.$$
Thus,
$$Z = \iint \tilde f(\theta | y)\, d\beta\; d\sigma.$$

The integral above is intractable, but a gridding approximation may be used. Using an equi-spaced gridding, a Riemann Sum approximation is:

$$Z \approx \Delta \beta \Delta \sigma \sum_i \tilde f(\theta_i | y),$$

where $\Delta \beta$ and $\Delta \sigma$ are the discretization steps of the 2D grid, $\tilde f$ is the unnormalized posterior and $\theta_i$ are the grid points.

In [None]:
from scipy.special import logsumexp

# Probability mass per grid cell: exp(log p~ - log(sum p~)) adds up to one
# over the grid, and logsumexp keeps the computation free of underflow.
POST_UNSC = np.exp(LP_UNSC - logsumexp(LP_UNSC))
# Dividing each mass by the cell area turns it into a density value.
POST_SC = POST_UNSC / (dbeta * dsigma)

# Density map with both point estimates overlaid:
# black cross = ML estimate, black circle = MAP estimate.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BETA, SIGMA, POST_SC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_ml[0], theta_ml[1], "kx")
ax.plot(theta_map[0], theta_map[1], "ko")
fig.colorbar(c, ax=ax)
ax.set(title="Normalized posterior", xlabel=r"$\beta$", ylabel=r"$\sigma$");

* Using the grid-based approximation of the posterior, compute the posterior mean of $\beta$ and $\sigma$.

By definition, we have:

$$E[\theta] = \iint \theta\, f(\theta | y)\, d\beta\; d\sigma.$$

Using the grid-based approximation above:

$$E[\theta] \approx \Delta \beta \Delta \sigma \sum_i \theta_i f(\theta_i | y).$$

Software implementation below

In [None]:
# Riemann-sum approximation of the posterior means: each grid value is
# weighted by the density at that node times the cell area dbeta * dsigma.
beta_mean = (BB * POST_SC).sum() * dbeta * dsigma
sigma_mean = (SS * POST_SC).sum() * dbeta * dsigma
beta_mean, sigma_mean

This is (yet another!) meaningful point estimate of $\theta$. 

## 1.4 Monte-carlo estimation

* Obtain a sample-based approximation of the posterior $f(\theta | y)$ by implementing the Metropolis algorithm from scratch.

In [None]:
def p_ratio_fun(beta_propose, sigma_propose, beta_previous, sigma_previous):
    """Posterior ratio p(proposed) / p(previous) for a Metropolis step.

    Both points are evaluated with ``log_post_unscaled``; the unknown
    normalization constant cancels in the ratio. The difference is taken in
    log space and exponentiated only at the end.
    """
    log_prev = log_post_unscaled(beta_previous, sigma_previous)
    log_prop = log_post_unscaled(beta_propose, sigma_propose)
    # log(p_prop / p_prev) = log(p_prop) - log(p_prev)
    return np.exp(log_prop - log_prev)

In [None]:
# Example call: posterior ratio of the proposed point (beta=1.89, sigma=0.374)
# against the previous point (beta=24.76, sigma=20.04).
p_ratio_fun(beta_propose = 1.89, sigma_propose = 0.374, beta_previous = 24.76, sigma_previous = 20.04)

Let us run a Metropolis algorithm to sample from the posterior. The `p_ratio_fun` function is all we need!

In [None]:
# Random-walk Metropolis sampler for theta = (beta, sigma).
# It draws from the global np.random generator, so the result depends on the
# generator's state when this cell is run.
N = 100_000 # number of Metropolis steps
beta_0 = 7.0 # initial value for beta
sigma_0 = 3.0 # initial value for sigma

# Current state of the chain.
beta_step = beta_0
sigma_step = sigma_0

# Standard deviations of the Gaussian proposal, one per parameter.
sigma_prop_beta = 0.1
sigma_prop_sigma = 0.1

betas = []
sigmas = []
for idx in range(N):
    # Record the current state *before* attempting a move: the stored chain
    # therefore starts at (beta_0, sigma_0) and has exactly N entries.
    betas.append(beta_step)
    sigmas.append(sigma_step)

    # Propose a new point by adding independent Gaussian noise to each
    # coordinate of the current state.
    beta_prop = beta_step + sigma_prop_beta * np.random.randn()
    sigma_prop = sigma_step + sigma_prop_sigma * np.random.randn()
    
    # Accept the proposal with probability min(1, p(proposed) / p(current)).
    p_ratio = p_ratio_fun(beta_prop, sigma_prop, beta_step, sigma_step)
    accept_prob = np.minimum(1.0, p_ratio)
    accept = (np.random.rand() < accept_prob)
    
    # On rejection the chain stays where it is, so the current state is
    # recorded again at the next iteration.
    if accept:
        beta_step = beta_prop
        sigma_step = sigma_prop

# Convert the traces to arrays; thetas has one row per step with columns
# (beta, sigma).
betas = np.stack(betas)
sigmas = np.stack(sigmas)
thetas = np.c_[betas, sigmas]

In [None]:
# Metropolis samples in parameter space: one marker per recorded chain state.
# Title and axis labels follow the other (beta, sigma) figures of the notebook
# so that the plot can be read on its own.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(betas, sigmas)
ax.set_title("Metropolis samples")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");