In [None]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import scipy.stats as stats
from matplotlib import cm
from scipy.optimize import minimize

# Assignment 1: Toxicity bioassay

## Problem and data

The length of babies follows a square root formula...

In [None]:
# Load the observations; the "Month" and "Length" columns are used throughout.
babies = pd.read_csv('babies.csv')
month_obs = babies["Month"].to_numpy()
length_obs = babies["Length"].to_numpy()

# Preview a few random rows. Only the last expression of a cell is rendered,
# so the preview goes at the end; the fixed seed keeps it stable across re-runs.
babies.sample(5, random_state=42)

## EDA

In [None]:
#babies.groupby("Month").mean()

In [None]:
# Raw observations; the strong transparency shows where points pile up.
fig, ax = plt.subplots()
months, lengths = babies["Month"], babies["Length"]
ax.plot(months, lengths, 'C0.', alpha=0.1)
ax.set(xlabel="Month", ylabel="Length");

In [None]:
# Square-root growth model with a free intercept:
#   length_i ~ Normal(alpha + beta * sqrt(month_i), sigma)
with pm.Model() as model_baby_linear_full:
    alpha = pm.Normal('alpha', sigma=10)
    beta = pm.Normal('beta', sigma=10)

    # Expected length of every observation, stored in the trace under "length_mean".
    month_sqrt = np.sqrt(babies["Month"].to_numpy())
    length_mean = pm.Deterministic("length_mean", alpha + beta * month_sqrt)
    sigma = pm.HalfNormal("sigma", sigma=10)

    length = pm.Normal("length", mu=length_mean, sigma=sigma, observed=babies["Length"].to_numpy())

    # Fixed seed so that Restart & Run All reproduces the same draws.
    inf_data_full = pm.sample(draws=2000, tune=4000, random_seed=42)

In [None]:
# Collapse the (chain, draw) dimensions into one "sample" dimension.
posterior_full = inf_data_full.posterior
inf_data_full_flat = posterior_full.stack(sample=("chain", "draw"))

In [None]:
# Joint posterior draws of intercept and slope from the full model.
fig, ax = plt.subplots()
ax.scatter(inf_data_full_flat["alpha"], inf_data_full_flat["beta"])
ax.set_xlabel("alpha")
ax.set_ylabel("beta")

In [None]:
# Same square-root growth model, but with the intercept pinned to a constant:
#   length_i ~ Normal(49.5 + beta * sqrt(month_i), sigma)
with pm.Model() as model_baby_linear:
    alpha = 49.5  # fixed intercept (not sampled); assigned at module level
    beta = pm.Normal('beta', sigma=10)

    # Expected length of every observation, stored in the trace under "length_mean".
    month_sqrt = np.sqrt(babies["Month"].to_numpy())
    length_mean = pm.Deterministic("length_mean", alpha + beta * month_sqrt)
    sigma = pm.HalfNormal("sigma", sigma=10)

    length = pm.Normal("length", mu=length_mean, sigma=sigma, observed=babies["Length"].to_numpy())

    # Fixed seed so that Restart & Run All reproduces the same draws.
    inf_data = pm.sample(draws=2000, tune=4000, random_seed=42)

In [None]:
# Summarise only the sampled parameters: the trace also stores "length_mean",
# which has one entry per observation and would add one table row per data point.
az.summary(inf_data, var_names=["beta", "sigma"])

In [None]:
# Collapse the (chain, draw) dimensions into one "sample" dimension.
posterior_fixed = inf_data.posterior
inf_data_flat = posterior_fixed.stack(sample=("chain", "draw"))

In [None]:
# Joint posterior draws of slope and noise level from the fixed-intercept model.
fig, ax = plt.subplots()
ax.scatter(inf_data_flat["beta"], inf_data_flat["sigma"])
ax.set(xlabel="beta", ylabel="sigma");

## Modeling assumptions

For the probabilistic model, we make the following assumptions:

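1. The average baby length at month $M_i$ is $\alpha + \beta \sqrt{M_i}$, where the intercept $\alpha$ is fixed to 49.5.

2. The standard deviation $\sigma$ of the length around this average is the same for all months.
3. The prior probabilities of the parameters are $\beta \sim \mathcal{N}(0, 10)$ and $\sigma \sim |\mathcal{N}|(10)$.
4. Probabilities are independent: given $\theta = (\beta, \sigma)$, the observations $y_i$ are independent of each other.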


## 1.1: Probabilistic model

* Derive and comment the full probabilistic model.

Putting together the probabilistic assumptions 1-3, we obtain:

\begin{align*}
y_i &\sim \mathcal{N}(\alpha + \beta \sqrt{M_i}, \sigma)\\
\beta &\sim \mathcal{N}(0, 10)\\
\sigma &\sim |\mathcal{N}|(10)
\end{align*}

Furthermore, according to assumption 4:

$$f(y|\theta) = \prod_i f(y_i|\theta)$$

## 1.2: Maximum Likelihood estimation 


* Derive an analytical expression of the likelihood function $\mathcal{L}(\theta) = f(y|\theta)$.

The likelihood function $\mathcal{L}(\theta)$ is $P(y|\theta)$, seen as a function of $\theta$, with $y$ fixed to the observed outcome. <br/>Since the individual observations $y_i$ are independent, and writing $x_i = \sqrt{M_i}$, we have:

$$\mathcal{L}(\theta) = f(y|\theta) = \prod_{i=1}^N \frac{1}{\sigma \sqrt{2\pi}} 
e^{-\frac{1}{2} \bigg( \frac{y_i - (\alpha + \beta x_i)}{\sigma} \bigg )^2 }  $$

* Derive an analytical expression of the log-likelihood function $\ell(\theta) = \log P(y|\theta)$.

\begin{align}\ell(\theta) &= \sum_{i=1}^N \left[ -\log \sigma -\frac{1}{2} \log(2\pi) -\frac{1}{2}\bigg( \frac{y_i - (\alpha + \beta x_i)}{\sigma} \bigg )^2 \right]\\
&= -\frac{N}{2} \log (2\pi) -N\log \sigma - \frac{1}{2 \sigma^2}\sum_{i=1}^{N} \big( y_i - (\alpha + \beta x_i)  \big) ^2
\end{align}

* Write a Python function corresponding to the likelihood function $\mathcal{L}(\theta)$. Ignore multiplicative factors which do not depend on $\theta$.

In [None]:
def lik(beta, sigma):
    """Placeholder for the likelihood function; not implemented, returns None."""
    # TODO: implement
    return None

In [None]:
# Likelihood of all observations in `babies`.
# The overall likelihood is the product of the per-observation Gaussian densities.

def lik(beta, sigma, alpha=49.5):
    """Likelihood of the data under length ~ Normal(alpha + beta*sqrt(Month), sigma).

    Parameters
    ----------
    beta, sigma : float or array_like
        Slope and standard deviation; they must broadcast against each other
        (e.g. two meshgrid arrays of the same shape).
    alpha : float, default 49.5
        Intercept. It is an explicit argument so the function does not depend
        on a module-level ``alpha`` that other cells may rebind.

    Returns
    -------
    numpy.ndarray
        Product of the Gaussian densities of all rows of ``babies``, with the
        broadcast shape of ``beta`` and ``sigma`` (shape ``(1,)`` for scalars).

    Notes
    -----
    The product of many densities may underflow to 0 for large data sets;
    the log-likelihood is the numerically safer quantity.
    """
    # A trailing axis of length 1 is appended to the parameters so that the 1-D
    # data broadcasts along it; no manual padding of the data is required.
    beta = np.atleast_1d(beta)[..., np.newaxis]
    sigma = np.atleast_1d(sigma)[..., np.newaxis]
    m_sqrt = np.sqrt(babies["Month"].to_numpy())
    y = babies["Length"].to_numpy()

    resid = y - (alpha + beta * m_sqrt)
    dens = 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-1 / 2 * (resid / sigma) ** 2)
    return np.prod(dens, axis=-1)

* Visualize the likelihood function in 2D and comment the obtained figure. 

   Hints:
    * you may use the `pcolormesh` function of `matplotlib`
    * appropriate ranges for $\beta$ and $\sigma$ are $[5, 10]$ and $[2, 4]$, respectively
    * an appropriate step size for both $\beta$ and $\sigma$ is 0.01

In [None]:
# Evaluate the likelihood on a regular (beta, sigma) grid and show it as a heat map.
dbeta = 0.01
dsigma = 0.01

BETA = np.arange(5, 10, dbeta)
SIGMA = np.arange(2, 4, dsigma)
BB, SS = np.meshgrid(BETA, SIGMA, indexing='xy')  # both have shape (len(SIGMA), len(BETA))
LL = lik(BB, SS)

fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, LL, cmap=cm.coolwarm, shading='auto')
fig.colorbar(c, ax=ax)
ax.set_title("Likelihood")
# The horizontal axis is beta and the vertical axis is sigma (see the meshgrid above).
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

In [None]:
def log_lik(beta, sigma, alpha=49.5):
    """Log-likelihood of the data under length ~ Normal(alpha + beta*sqrt(Month), sigma).

    Parameters
    ----------
    beta : float
        Slope of the mean with respect to sqrt(Month).
    sigma : float
        Standard deviation of the observation noise.
    alpha : float, default 49.5
        Intercept. It is an explicit argument so the function does not depend
        on a module-level ``alpha`` that other cells may rebind.

    Returns
    -------
    float
        ``-N/2*log(2*pi) - N*log(sigma) - sum(resid**2) / (2*sigma**2)`` where
        ``N`` is the number of rows of ``babies``.
    """
    x = np.sqrt(babies["Month"].to_numpy())
    y = babies["Length"].to_numpy()
    n_obs = x.shape[0]
    sq_err = np.sum((y - (alpha + beta * x)) ** 2)
    return -n_obs / 2 * np.log(2 * np.pi) - n_obs * np.log(sigma) - 1 / (2 * sigma**2) * sq_err

In [None]:
def log_lik_vec(beta, sigma, alpha=49.5):
    """Vectorized log-likelihood under length ~ Normal(alpha + beta*sqrt(Month), sigma).

    Parameters
    ----------
    beta, sigma : float or array_like
        Slope and standard deviation; they must broadcast against each other.
    alpha : float, default 49.5
        Intercept. It is an explicit argument so the function does not depend
        on a module-level ``alpha`` that other cells may rebind.

    Returns
    -------
    numpy.ndarray
        Sum of the per-observation Gaussian log-densities over all rows of
        ``babies``, with the broadcast shape of ``beta`` and ``sigma``
        (shape ``(1,)`` for scalars).
    """
    # A trailing axis of length 1 is appended to the parameters so that the 1-D
    # data broadcasts along it; no manual padding of the data is required.
    beta = np.atleast_1d(beta)[..., np.newaxis]
    sigma = np.atleast_1d(sigma)[..., np.newaxis]
    m_sqrt = np.sqrt(babies["Month"].to_numpy())
    y = babies["Length"].to_numpy()

    resid = y - (alpha + beta * m_sqrt)
    per_obs = -1 / (2 * sigma**2) * resid**2 - np.log(sigma * np.sqrt(2 * np.pi))
    return np.sum(per_obs, axis=-1)

In [None]:
# Evaluate the log-likelihood on a fine (beta, sigma) grid around its maximum.
dbeta = 0.002
dsigma = 0.002

BETA = np.arange(7.1, 7.5, dbeta)
SIGMA = np.arange(2.6, 3.5, dsigma)
BB, SS = np.meshgrid(BETA, SIGMA, indexing='xy')

# One vectorized call per sigma value (a whole row of beta values at once)
# instead of one Python call per grid point. Rows index sigma, columns index beta.
LL = np.stack([log_lik_vec(BETA, sigma_i) for sigma_i in SIGMA])

fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, LL, cmap=cm.coolwarm, shading='auto')

fig.colorbar(c, ax=ax)
ax.set_title("Log-likelihood")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

In [None]:
def log_lik_theta(theta):
    """Log-likelihood as a function of the parameter vector theta = (beta, sigma)."""
    return log_lik(theta[0], theta[1])


def nll_theta(theta):
    """Negative log-likelihood; this is the objective handed to the minimizer."""
    return -log_lik_theta(theta)


# sigma has to stay strictly positive, otherwise log(sigma) is undefined and the
# optimizer can wander into NaN territory. beta is left unbounded.
res = minimize(nll_theta, x0=[7, 2], bounds=[(None, None), (1e-6, None)])
theta_ml = res.x

In [None]:
# Show the maximum-likelihood estimate, ordered as (beta, sigma).
theta_ml

* Plot the log-likelihood together with the maximum likelihood estimate

In [None]:
# Log-likelihood surface (LL) with the maximum-likelihood estimate marked by a cross.
fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, LL, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_ml[0], theta_ml[1], "kx")
fig.colorbar(c, ax=ax)
# LL holds log-likelihood values, so the title says so.
ax.set_title("Log-likelihood")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

* Plot the likelihood function up to a normalization constant

In [None]:
# Likelihood up to a constant factor: subtracting the maximum before
# exponentiating keeps the values in (0, 1] and avoids underflow.
LIK_SC = np.exp(LL - np.max(LL))

fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, LIK_SC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_ml[0], theta_ml[1], "kx")
fig.colorbar(c, ax=ax)
# LIK_SC is a likelihood (not a log-likelihood), scaled to have maximum 1.
ax.set_title("Scaled likelihood")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

## 1.2: Maximum A Posteriori Estimation

* Derive an analytical expression of the posterior $f(\theta | y)$, up to a multiplicative factor not depending on $\theta$. 

Hint: exploit the already-obtained likelihood and the functional form of the Gaussian pdf.

$$f(\theta | y) = \frac{P(y | \theta) f(\theta)}{P(y)} \propto \mathcal{L}(\theta) p(\beta) p(\sigma) $$

Thus, 

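$$f(\theta | y) \propto \mathcal{L}(\theta)\, e^{-\frac{1}{2} \frac{\beta^2}{100}}\, e^{-\frac{1}{2} \frac{\sigma^2}{100}} = \left[ \prod_{i=1}^N \frac{1}{\sigma \sqrt{2\pi}} 
e^{-\frac{1}{2} \bigg( \frac{y_i - (\alpha + \beta x_i)}{\sigma} \bigg )^2 } \right] e^{-\frac{\beta^2 + \sigma^2}{200}}, \qquad \sigma > 0 $$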

* Derive an analytical expression of the log-posterior $\log f(\theta | y)$, up to an additive factor not depending on $\theta$.

$$\log f(\theta | y) = \log \frac{P(y | \theta) f(\theta)}{P(y)} = \log P(y | \theta) + \log f(\beta) + \log f(\sigma) - \log P(y) $$

Thus, 

$$\log f(\theta | y) = \ell(\theta) -\frac{1}{2} \frac{\beta^2}{100} -\frac{1}{2} \frac{\sigma^2}{100} $$

* Write the unnormalized posterior and log-posterior (up to a multiplicative/additive factor, respectively) as Python functions.

In [None]:
def log_post_unscaled(beta, sigma):
    """Log-posterior of (beta, sigma) up to an additive constant.

    Adds the log-priors beta ~ Normal(0, 10) and sigma ~ HalfNormal(10)
    (constants dropped) to the log-likelihood.

    Parameters
    ----------
    beta : float
        Slope of the mean with respect to sqrt(Month).
    sigma : float
        Standard deviation of the observation noise.

    Returns
    -------
    float
        ``log_lik(beta, sigma) - beta**2/200 - sigma**2/200`` for ``sigma > 0``;
        ``-inf`` for a scalar ``sigma <= 0``.
    """
    # The half-normal prior has no mass at sigma <= 0. Returning -inf there also
    # avoids evaluating log(sigma) on an invalid value inside log_lik.
    if np.ndim(sigma) == 0 and sigma <= 0:
        return -np.inf
    log_lik_val = log_lik(beta, sigma)
    return log_lik_val - 0.5 * beta**2 / 100 - 0.5 * sigma**2 / 100

* Compute the maximum a posteriori estimate $\beta^{\rm MAP}, \sigma^{\rm MAP}$.

In [None]:
def minus_logpost(theta):
    """Negative unnormalized log-posterior of theta = (beta, sigma)."""
    return -log_post_unscaled(theta[0], theta[1])


# sigma has to stay strictly positive (log(sigma) appears in the likelihood);
# beta is left unbounded.
res = minimize(minus_logpost, x0=[7, 2], bounds=[(None, None), (1e-6, None)])
theta_map = res.x

In [None]:
# Show the maximum a posteriori estimate, ordered as (beta, sigma).
theta_map

* Visualize the MAP estimate together with the unnormalized posterior in 2D. Comment the results.

In [None]:
# Unnormalized log-posterior on the (beta, sigma) grid; rows index sigma, columns beta.
LP_UNSC = np.array(
    [[log_post_unscaled(beta_j, sigma_i) for beta_j in BETA] for sigma_i in SIGMA]
)

# Unnormalized posterior. Subtracting the maximum before exponentiating only
# changes the (arbitrary) scale and prevents underflow to zero.
POST_UNSC = np.exp(LP_UNSC - np.max(LP_UNSC))

fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BETA, SIGMA, POST_UNSC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_map[0], theta_map[1], "kx")
fig.colorbar(c, ax=ax)
ax.set_title("Unnormalized posterior")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

## 1.3 Brute-force posterior estimation

* Compute a gridding approximation of the *normalized* posterior, with the correct normalization constant. Explain the passages.

We have:
    $$ \tilde f(\theta | y) = \mathcal{L}(\theta) \exp\left(-\frac{1}{2} \frac{\beta^2}{100} -\frac{1}{2} \frac{\sigma^2}{100} \right) = Z f(\theta | y),$$
where $Z$ is the to-be-determined normalization constant and it must be chosen such that:
$$\iint f(\theta | y) d\beta\; d\sigma = 1.$$
Thus,
$$Z = \iint \tilde f(\theta | y) d\beta\; d\sigma.$$

The integral above is intractable, but a gridding approximation may be used. Using an equi-spaced gridding, a Riemann Sum approximation is:

$$Z \approx \Delta \beta \Delta \sigma \sum_i \tilde f(\theta_i | y),$$

where $\Delta \beta$ and $\Delta \sigma$ are the discretization steps of the 2D grid and $\theta_i$ are the grid points.

In [None]:
# Grid spacings are read from the axes the log-posterior was evaluated on, so
# they cannot drift away from the actual grid.
dbeta_grid = BETA[1] - BETA[0]
dsigma_grid = SIGMA[1] - SIGMA[0]

# Unnormalized posterior from the gridded log-posterior LP_UNSC. Subtracting
# the maximum avoids underflow; the factor cancels in the normalization below.
post_unnorm = np.exp(LP_UNSC - np.max(LP_UNSC))

# Riemann-sum approximation of the normalization constant.
normalizing_factor = np.sum(post_unnorm) * dbeta_grid * dsigma_grid
POST_SC = post_unnorm / normalizing_factor


fig, ax = plt.subplots(figsize=(10, 6))
c = ax.pcolormesh(BB, SS, POST_SC, cmap=cm.coolwarm, shading='auto')
ax.plot(theta_map[0], theta_map[1], "kx")
fig.colorbar(c, ax=ax)
ax.set_title("Normalized posterior")
ax.set_xlabel(r"$\beta$")
ax.set_ylabel(r"$\sigma$");

* Using the grid-based approximation of the posterior, compute the posterior mean of $\beta$ and $\sigma$.

By definition, we have:

$$E[\theta] = \iint \theta f(\theta | y) d\beta\; d\sigma.$$

Using the grid-based approximation above:

$$E[\theta] \approx \Delta \beta \Delta \sigma \sum_i \theta_i f(\theta_i | y).$$

Software implementation below

In [None]:
# Posterior means by a Riemann sum over the (beta, sigma) grid on which
# POST_SC is defined; BB and SS hold the grid coordinates.
cell_area = (BETA[1] - BETA[0]) * (SIGMA[1] - SIGMA[0])
beta_mean = np.sum(BB * POST_SC) * cell_area
sigma_mean = np.sum(SS * POST_SC) * cell_area
beta_mean, sigma_mean

This is (yet another!) meaningful point estimate of $\theta$. 

## 1.4 Monte-carlo estimation

* Obtain a sample-based approximation of the posterior $f(\theta | y)$ by implementing the Metropolis algorithm from scratch.

In [None]:
def p_ratio_fun(alpha_propose, beta_propose, alpha_previous, beta_previous):
    """Ratio of unnormalized posterior densities, proposed point over previous point.

    Each (alpha_*, beta_*) pair is forwarded positionally to
    ``log_post_unscaled``; the ratio is formed in log space and exponentiated.

    NOTE(review): ``log_post_unscaled`` names its arguments (beta, sigma), so the
    alpha_*/beta_* arguments here presumably carry (beta, sigma) values — confirm.
    """
    log_prev = log_post_unscaled(alpha_previous, beta_previous)
    log_prop = log_post_unscaled(alpha_propose, beta_propose)
    # log(p_prop / p_prev) = log(p_prop) - log(p_prev)
    return np.exp(log_prop - log_prev)

In [None]:
# Density ratio for a move from (0.374, 20.04) to (1.89, 24.76).
# NOTE(review): these test points look like they belong to a different problem;
# the values are passed on to log_post_unscaled(beta, sigma) — confirm they are meaningful here.
p_ratio_fun(alpha_propose = 1.89, alpha_previous = 0.374, beta_propose = 24.76, beta_previous = 20.04)

In [None]:
# Same two points with the roles swapped: a move from (1.89, 24.76) to (0.374, 20.04).
# NOTE(review): these test points look like they belong to a different problem — confirm.
p_ratio_fun(alpha_propose = 0.374, alpha_previous = 1.89, beta_propose = 20.04, beta_previous = 24.76)

Let us run a Metropolis algorithm to sample from the posterior. The `p_ratio_fun` function is all we need!

In [None]:
N = 100_000  # number of Metropolis steps
rng = np.random.default_rng(42)  # seeded so the chain is reproducible

# Start the chain at the MAP estimate, which is ordered as (beta, sigma).
beta_step, sigma_step = theta_map

# Standard deviations of the Gaussian random-walk proposals.
# NOTE(review): chosen to be small relative to the grid ranges used for the
# posterior plots; tune them if the acceptance rate turns out too high or too low.
sigma_prop_beta = 0.05
sigma_prop_sigma = 0.1

betas = np.empty(N)
sigmas = np.empty(N)
for idx in range(N):
    # Record the current state before proposing the next one.
    betas[idx] = beta_step
    sigmas[idx] = sigma_step

    beta_prop = beta_step + sigma_prop_beta * rng.standard_normal()
    sigma_prop = sigma_step + sigma_prop_sigma * rng.standard_normal()

    # A non-positive sigma has zero posterior density: reject without evaluating it.
    if sigma_prop <= 0:
        continue

    # p_ratio_fun forwards each pair positionally to log_post_unscaled(beta, sigma).
    p_ratio = p_ratio_fun(beta_prop, sigma_prop, beta_step, sigma_step)
    accept_prob = np.minimum(1.0, p_ratio)

    if rng.random() < accept_prob:
        beta_step, sigma_step = beta_prop, sigma_prop

# Column 0 holds beta, column 1 holds sigma.
thetas = np.c_[betas, sigmas]

* Compare the Metropolis samples with the gridding-based approximation of the posterior distribution $f(\theta | y)$ and comment the result.

In [None]:
# Left: 2-D histogram of the Metropolis samples with contours of the gridded
# posterior on top. Right: the gridded posterior itself.
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
ax[0].hist2d(x=thetas[:, 0], y=thetas[:, 1], bins=100, cmap=plt.cm.BuPu)
ax[0].contour(BB, SS, POST_SC)
# Restrict the view to the region covered by the grid so both panels are comparable.
ax[0].set_xlim([BETA[0], BETA[-1]])
ax[0].set_ylim([SIGMA[0], SIGMA[-1]])
ax[0].set_title("Metropolis samples")

c = ax[1].pcolormesh(BB, SS, POST_SC, cmap=cm.coolwarm, shading='auto')
fig.colorbar(c, ax=ax[1])
ax[1].set_title("Gridded posterior")

for panel in ax:
    panel.set_xlabel(r"$\beta$")
    panel.set_ylabel(r"$\sigma$")

In [None]:
# Column-wise sample mean, skipping the first 10,000 draws of the chain.
thetas[10_000:].mean(axis=0)

In [None]:
# Sample covariance between the two columns of the chain (rows are draws).
np.cov(thetas, rowvar=False)

In [None]:
# Grid-based posterior means of (beta, sigma), for comparison with the sample means.
# BB and SS hold the grid coordinates on which POST_SC is defined.
cell_area = (BETA[1] - BETA[0]) * (SIGMA[1] - SIGMA[0])
np.sum(BB * POST_SC) * cell_area, np.sum(SS * POST_SC) * cell_area

In [None]:
# Trace of the first column of the chain against the iteration index.
first_coord = thetas[:, 0]
plt.plot(first_coord)

In [None]:
# Trace of the second column of the chain against the iteration index.
second_coord = thetas[:, 1]
plt.plot(second_coord)

* Obtain a sample-based approximation of the posterior $f(\theta | y)$ using pymc3.

In [None]:
# NOTE(review): this cell defines a binomial model with a sigmoid link, not the
# Normal square-root model used in the rest of the notebook. The names mu_alpha,
# sigma_alpha, mu_beta, sigma_beta, x, n and y are not defined in any cell
# visible here — confirm they exist or adapt the model.
# It also rebinds the module-level names `alpha` and `beta` to PyMC variables.
with pm.Model() as bioassay:
    # Normal priors on the two coefficients of the linear predictor.
    alpha = pm.Normal("alpha", mu=mu_alpha, sigma=sigma_alpha)
    beta = pm.Normal("beta", mu=mu_beta, sigma=sigma_beta)
    # Success probability: sigmoid of the linear predictor, stored in the trace as "p".
    p = pm.Deterministic("p", pm.math.sigmoid(alpha + beta*x))
    # Binomial likelihood of the observed counts y out of n trials.
    y_var = pm.Binomial("y_var", n=n, p=p, observed=y)
    # 10,000 draws with the sampler's default settings (no seed is set here).
    trace=pm.sample(10_000)

* Comment the results obtained with pymc3 and compare with previous results (gridding and Metropolis from scratch)

In [None]:
# Tabulate summary statistics of the sampled variables.
with bioassay:
    trace_summary = az.summary(trace)
    display(trace_summary)

In [None]:
# Trace plots of the variables stored in `trace`; the trailing semicolon hides the returned axes.
az.plot_trace(trace);

In [None]:
# Marginal posterior plots for the variables named "alpha" and "beta" in `trace`.
# NOTE(review): the grid and Metropolis sections work with (beta, sigma);
# confirm that "alpha"/"beta" are the variables meant to be compared here.
az.plot_posterior(trace, var_names=["alpha", "beta"]);