In [None]:
import numpy as np
import scipy.stats as sts
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from scipy.integrate import solve_ivp
import cmdstanpy
import os

import sys
sys.path.append("..")

from stancourse import plots

## os.name is "nt" on Windows only; other platforms skip this step.
## The helper is called purely for its side effect (see the inline note).
if os.name == "nt": ## adds compiler to path in Windows
    cmdstanpy.utils.cxx_toolchain_path() 

# Introduction

Contents

* Goals of this Webinar
* Jupyter notebook
* Bayesian Inference
* Markov-Chain Monte Carlo
* Hamiltonian Monte Carlo

## Goals of this Webinar

**Goal 1:** Get some understanding of how Stan works: May help with debugging

**Goal 2:** Introduction to programming with Stan: Simple models, programming techniques

Aspects of immunobiology / viral dynamics / epidemiology models and data
* Non-linear, dynamical models
* Repeated experiments (panel data)

**Goal 3:** ODE models in Stan and multi-threading

## Jupyter notebooks

* Selected cells either have a <span style="color:blue">blue</span> or <span style="color:green">green</span> border
    * <span style="color:blue">blue cells are in "command mode"</span>
    * <span style="color:green">green cells are in "edit mode"</span>
* Execute a selected cell with `shift-enter` or `ctrl-enter`. `shift-enter` will select the next cell. 
* **WARNING** Make sure that the cell is in <span style="color:green">edit mode</span> before typing in the cell
* Switch to edit mode by pressing `enter` or clicking on the cell
* The notebooks contain "code" cells and "markdown" cells
* double click on a "markdown" cell to edit

In [None]:
## code cell: type python code here
## print writes to the cell's output; in addition, the value of the
## last expression of a cell is displayed below the cell.
print("Welcome to this Stan Webinar")
sts.norm.rvs() ## one random draw from the standard normal distribution

## Bayesian Inference
**Definitions and Bayes Theorem**

Bayesian model "ingredients":

1. <span style="color:blue">Prior distribution $\pi(\theta)$</span> of parameters $\theta \in \Theta$ encodes prior information about model **before we've seen our data**
2. <span style="color:purple">Likelihood function $L(D | \theta)$</span> determines the likelihood of the data $D$, given parameters $\theta$

Bayes theorem gives the <span style="color:red">posterior density</span> of the parameters $\theta$, **given the data $D$**

\begin{equation}
\color{red}{P(\theta | D)} = \frac{\color{blue}{\pi(\theta)} \color{purple}{L(D | \theta)}}{Z(D)}
\end{equation}

* The marginal probability of the data $Z(D) = \int_{\Theta} \pi(\theta) L(D|\theta) d\theta$ is almost always intractable
* Let $Q(\theta | D) = \color{blue}{\pi(\theta)} \color{purple}{L(D | \theta)}$ denote the un-normalized posterior density.


## Bayesian Inference
**Example**

Take $N$ random samples, $K$ are "positive" ($N-K$ are "negative"). Goal: Estimate fraction $\theta$ of positive individuals in the population
* <span style="color:blue">prior distribution $\theta \sim {\rm Beta}(\alpha, \beta)$</span>
* <span style="color:purple">Binomial likelihood function $L(K | \theta) = \binom{N}{K} \theta^K (1-\theta)^{N-K}$</span>
* Unnormalized posterior
\begin{equation}
Q(\theta|K) = \color{blue}{\pi(\theta)} \color{purple}{L(K | \theta)} = \color{blue}{\frac{\theta^{\alpha-1}(1-\theta)^{\beta-1}}{B(\alpha, \beta)}}  \color{purple}{\binom{N}{K} \theta^K (1-\theta)^{N-K}}
\end{equation}
* Normalizing constant
\begin{equation}
Z(K) = \frac{\binom{N}{K}}{B(\alpha, \beta)} \int_0^1 \theta^{K+\alpha-1} (1-\theta)^{N-K+\beta-1} d\theta = 
\frac{\binom{N}{K} B(K+\alpha, N-K + \beta)}{B(\alpha, \beta)}
\end{equation}
* Hence, the <span style="color:red">posterior distribution</span> of $\theta$ is again given by a Beta distribution
\begin{equation}
\color{red}{\theta | K \sim {\rm Beta}(K + \alpha, N-K + \beta)}
\end{equation}

In [None]:
## parameters of the Beta prior, and the data (K positives out of N samples)
alpha, beta = 2, 5
N, K = 100, 10

## prior density (ys) and conjugate posterior density (zs) on a grid
xs = np.linspace(0, 1, 1000)
ys = sts.beta.pdf(xs, alpha, beta)
zs = sts.beta.pdf(xs, alpha+K, beta+N-K)

fig, ax = plt.subplots(1, 1, figsize=(5,4))

## (density, color, legend label) for each of the two curves
curves = [
    (ys, 'tab:blue', "prior $\\pi(\\theta)$"),
    (zs, 'tab:red', "posterior $P(\\theta | K)$"),
]

## shaded areas first (these carry the legend labels) ...
for dens, col, lab in curves:
    ax.fill_between(xs, dens, label=lab, alpha=0.5, color=col, linewidth=0)

## ... then the outlines on top
for dens, col, lab in curves:
    ax.plot(xs, dens, color=col, linewidth=2)

ax.set_xlabel("$\\theta$")
ax.set_ylabel("$density$")

ax.legend()

## Bayesian Inference 
**Example**

Parameters: $\alpha = 2$, $\beta = 5$, $N = 100$, $K = 10$

In [None]:
## fig still refers to the prior/posterior figure built in the previous code cell
fig ## show example figure

## Markov-Chain Monte Carlo
* *Problem:* Normalizing constant $Z(D)$ (and other integrals) is almost always intractable.
* *Solution:* We can generate a random sample from the posterior distribution **without knowing $Z(D)$**.
* Using the random sample $(\theta_i)_{i=1}^N$, we can approximate many statistics, e.g.
$\mathbb{E}[\theta] \approx \frac1N \sum_{i=1}^N \theta_i$

**Metropolis-Hastings algorithm**
1. Start with a sample $\theta_i$ 
2. Sample $\theta_i'$ from proposal distribution $q(\cdot | \theta_i)$ (symmetric: $q(a|b) = q(b|a)$)
3. **Accept** the proposed sample $\theta_i'$ with probability 
\begin{equation}
\min \left\{1, \frac{Q(\theta_i' | D)}{Q(\theta_i | D)}\right\}
\end{equation}
4. Next sample is
\begin{equation}
\theta_{i+1} = \left\{ \begin{array}{ll} \theta_{i}' & \mbox{if accept} \\ \theta_{i} & \mbox{otherwise}  \end{array} \right.
\end{equation}

**Theorem:** The sequence $\theta_1, \theta_2, \dots$ forms a Markov chain with stationary distribution equal to $P(\theta|D)$.

In [None]:
def MHstep(theta, q_rng, q_pdf, Q):
    """
    Perform one transition of the Metropolis-Hastings algorithm.

    Arguments:
    theta -- the current sample
    q_rng -- function that draws a proposal, given the current sample
    q_pdf -- proposal density; q_pdf(a, b) is the density of a, given b
    Q -- the unnormalized posterior density

    Returns the pair (next sample, accepted). The flag accepted is True
    if the proposal was accepted, and False if the chain stays at theta.
    """
    candidate = q_rng(theta)
    ## posterior ratio, corrected for a (possibly) asymmetric proposal
    numer = Q(candidate) * q_pdf(theta, candidate)
    denom = Q(theta) * q_pdf(candidate, theta)
    ratio = numer / denom
    ## no min(1, ratio) required: a uniform draw on [0,1) is always below 1
    accepted = sts.uniform.rvs() < ratio
    if accepted:
        return candidate, True
    return theta, False


def MH_example_figure(Sigma, sigma=0.7, N=1000):
    """
    Create a figure to demonstrate the MH algorithm.
    The target density is a multivariate normal distribution
    with covariance matrix Sigma and mean 0.

    Arguments:
    Sigma -- 2x2 covariance matrix of the target density
    sigma -- standard deviation of the isotropic normal proposal
    N -- number of Metropolis-Hastings steps

    Returns the tuple (fig, thetas, ar): the figure, the list with
    the N+1 states of the chain (the initial state included), and the
    fraction of accepted proposals (nan if N is 0).
    """
    ## define proposal and posterior
    def q_rng(theta):
        return theta + sts.norm.rvs(loc=0, scale=sigma, size=2)

    def q_pdf(theta_prop, theta):
        ## independent components: the joint density is the product
        return np.prod(sts.norm.pdf(theta_prop, loc=theta, scale=sigma))

    def Q(theta):
        return sts.multivariate_normal.pdf(theta, cov=Sigma)

    theta = np.zeros(2) ## start the chain at the mode of the target
    thetas = [theta]
    num_accepted = 0 ## used to compute the fraction of accepted samples

    ## repeatedly take a Metropolis-Hastings step.
    for _ in range(N):
        theta, acc = MHstep(theta, q_rng, q_pdf, Q)
        thetas.append(theta)
        if acc:
            num_accepted += 1

    ## convert the chain once to an array (one row per state) for plotting
    trace = np.array(thetas)

    ## make a figure of the results:
    ## two trace plots on the left, the path through parameter space on the right
    fig = plt.figure(figsize=(7,3.5))
    gs = GridSpec(2,2)
    ax1 = fig.add_subplot(gs[1,0])
    ax2 = fig.add_subplot(gs[0,0], sharex=ax1)
    bx = fig.add_subplot(gs[:,1])

    ax1.plot(trace[:,0], color='k')
    ax2.plot(trace[:,1], color='k')

    ax1.set_ylabel("$\\theta^1$")
    ax2.set_ylabel("$\\theta^2$")
    ax1.set_xlabel("iteration")

    bx.plot(trace[:,0], trace[:,1],
            color='tab:red', linewidth=0.5, zorder=2)
    bx.scatter(trace[:,0], trace[:,1],
               color='k', s=1, zorder=3)

    ## target density drawn below the samples (nstd=2 and nstd=1 ellipses)
    plots.plot_cov_ellipse(bx, Sigma, np.zeros(2), nstd=2, alpha=0.3, zorder=1)
    plots.plot_cov_ellipse(bx, Sigma, np.zeros(2), nstd=1, alpha=0.5, zorder=1)

    w = 2.5
    bx.set_xlim(-w,w)
    bx.set_ylim(-w,w)

    bx.set_xlabel("$\\theta^1$")
    bx.set_ylabel("$\\theta^2$")

    ## guard against an empty chain
    ar = num_accepted / N if N > 0 else float("nan")

    ## return the figure, the samples and the fraction of accepted samples
    return fig, thetas, ar


## Markov-Chain Monte Carlo
**Metropolis Hastings Example**

* Multivariate-Normal posterior density $\theta | D \sim \mathcal{N}_2(0, \Sigma)$
* Normal proposal $\theta_i' \sim \mathcal{N}_2(\theta_i, \sigma^2 I_2)$
* chain length $N = 1000$

### Uncorrelated parameters

$$\Sigma = \left(\begin{array}{cc} 1 & 0 \\ 0 & 1 \end{array} \right)$$

In [None]:
## target with independent components (identity covariance matrix)
## NOTE(review): no random seed is set in the visible cells, so the chain
## and the acceptance ratio differ between runs
Sigma = np.array([[1, 0], [0, 1]])
fig, thetas, ar = MH_example_figure(Sigma)
print("acceptance ratio:", ar)

### Correlated parameters
$$\Sigma = \left(\begin{array}{cc} 1 & 0.9 \\ 0.9 & 1 \end{array} \right)$$

In [None]:
## target with strongly correlated components (correlation 0.9);
## fig, thetas and ar from the uncorrelated example are overwritten
Sigma = np.array([[1, 0.9], [0.9, 1]])
fig, thetas, ar = MH_example_figure(Sigma)
print("acceptance ratio:", ar)

Auto-correlated traces $\implies$ small *effective* sample size $\implies$ large Monte-Carlo error

## Hamiltonian Monte-Carlo

* Essentially Metropolis-Hastings with a very fancy proposal
* Extend parameters $\theta \in \mathbb{R}^n$ with auxiliary parameter vector $p \in \mathbb{R}^n$ (called the "conjugate momentum")
* Posterior density of $p$ is multivariate normal $p \sim \mathcal{N}_n(0, M)$. 
* Negative log-posterior joint density of $(\theta, p)$ is called the "Hamiltonian"
\begin{equation}
 \mathcal{H}(\theta, p) = \color{red}{-\log(Q(\theta | D))} + \color{blue}{\frac{1}{2} p^T M^{-1} p} + {\rm constant}
\end{equation}
* Interpretation: <span style="color:red">potential energy</span> + <span style="color:blue">kinetic energy</span>. Derive Hamilton's equations
\begin{equation}
\begin{split}
 \frac{d \theta}{d t} &= \frac{\partial \mathcal{H}}{\partial p} \\
 \frac{d p}{d t} &= -\frac{\partial \mathcal{H}}{\partial \theta}
\end{split}
\end{equation}

## Hamiltonian Monte-Carlo

**Algorithm:**

1. Start with state $(\theta_i, p_i)$, sample $p_0 \sim \mathcal{N}_n(0, M)$ directly from marginal posterior
2. Solve Hamiltonian initial value problem with $\theta(0) = \theta_i$ and $p(0) = p_0$ on the time interval $[0,T]$
3. Set proposal $\theta_i' = \theta(T)$ and $p_i' = -p(T)$ (minus sign makes it symmetric)
4. **Accept** proposed state with probability
\begin{equation}
 \min\left\{1, \exp\left(-\mathcal{H}(\theta_i', p_i') + \mathcal{H}(\theta_i, p_0)\right) \right\}
\end{equation}
5. Next state is
\begin{equation}
(\theta_{i+1}, p_{i+1}) = \left\{ \begin{array}{ll} (\theta_{i}', p_i') & \mbox{if accept} \\ (\theta_{i}, p_i) & \mbox{otherwise}  \end{array} \right.
\end{equation}

## Hamiltonian Monte-Carlo

**Why does this work?**

* The map $(\theta_i, p_0) \mapsto (\theta(T), p(T))$ is non-linear, but **preserves volume**, so we don't have to add a **"Jacobian correction"**.
* Because of **conservation of energy** ($\frac{d}{dt}\mathcal{H} = 0$), we should accept *all* proposed states.
* To get the posterior distribution of $\theta$, we just marginalize out $p$

See e.g. [Neal, *MCMC using Hamiltonian Dynamics*](https://arxiv.org/pdf/1206.1901.pdf) for many more details

**How does this actually work?**

* We have to know the gradient of $\mathcal{H}$, and hence of $\log(Q)$. Solution: automatic differentiation
* We don't want to integrate a system of ODEs with high accuracy for every Markov transition. Solution: leapfrog integration algorithm with step size $\epsilon$. Introduces numerical error, $\implies$ <strike>conservation of energy</strike>, and therefore the requirement of the MH step.
* Stan automatically sets algorithmic parameters (such as $M$, $\epsilon$) during warmup phase of the chain.

## Hamiltonian Monte-Carlo
**why does this work so well?**

In [None]:
def can_sys(t, y, Sigma):
    """
    Right-hand side of Hamilton's equations for a normal target density
    with mean 0 and covariance matrix Sigma, and unit mass matrix.

    The state y consists of the position (first two elements) followed
    by the momentum (remaining elements). The time t is not used; it is
    only there because the ODE solver passes it.
    """
    position = y[:2]
    momentum = y[2:]
    ## gradient of the potential energy: Sigma^{-1} * position
    grad_potential = np.linalg.solve(Sigma, position)
    return np.concatenate([momentum, -grad_potential])

## common starting point, and the covariance matrices of the two targets
x0 = np.array([1,1])
Sigma1 = np.array([[1.0,0.0],[0.0,1.0]])
Sigma2 = np.array([[1.0,0.9],[0.9,1.0]])

## the integration interval and the output grid are shared by all trajectories
t_span = (0, 2.5)
t_eval = np.linspace(*t_span, 1000)

w = 2.5 ## half-width of the plotted window

fig, axs = plt.subplots(1, 2, figsize=(10,5))

## one panel per target density
for ax, Sigma in zip(axs, [Sigma1, Sigma2]):
    ax.scatter(*x0, color='tab:green', zorder=3, label="$\\theta(0)$")
    for i in range(10):
        ## draw a random momentum and follow the Hamiltonian flow
        p0 = sts.norm.rvs(0, 1, size=2)
        y0 = np.concatenate([x0, p0])
        sol = solve_ivp(lambda t,y: can_sys(t,y,Sigma), t_span, y0, t_eval=t_eval)

        traj_x, traj_y = sol.y[0], sol.y[1]
        ax.plot(traj_x, traj_y, color='k', zorder=2)
        ## only the first end point gets a legend entry
        if i == 0:
            lab = "$\\theta(T)$"
        else:
            lab = None
        ax.scatter(traj_x[-1], traj_y[-1], color='r', zorder=3, label=lab)

    ## target density in the background
    for n_sd, opacity in [(2, 0.3), (1, 0.5)]:
        plots.plot_cov_ellipse(ax, Sigma, np.zeros(2), nstd=n_sd, alpha=opacity, zorder=1)

    ax.set_xlim(-w,w)
    ax.set_ylim(-w,w)
    ax.legend()

In [None]:
## fig still refers to the two-panel trajectory figure built in the previous code cell
fig ## show HMC figure

## Hamiltonian Monte-Carlo
**Distributions with variable correlation**

\begin{equation} 
\theta = (x,y) = (r \cos(\phi), r\sin(\phi)),\quad (r, \phi) \sim \mathcal{N}_2((1,0), \Sigma)
\end{equation}

In [None]:
## load the Stan model (the path is relative to the notebook's working directory)
sm = cmdstanpy.CmdStanModel(stan_file="../stan-models/circular_density.stan")

## mean of the polar coordinates (r, phi): radius r, angle 0
r = 1
mu = np.array([r,0])

## variance of the radius (wsq) and of the angle (hsq)
wsq = 0.01
hsq = 0.8

## the corresponding standard deviations
## NOTE(review): h and w are not used in the visible part of the notebook
h = np.sqrt(hsq)
w = np.sqrt(wsq)

## diagonal covariance matrix of (r, phi)
Sigma = np.array([[wsq, 0], [0, hsq]])

## data for the Stan model
## presumably the keys match the data block of circular_density.stan -- verify
data_dict = {
    "mu" : mu,
    "sigma" : Sigma
}

## a single chain with 5000 sampling iterations and thinning factor 5
sam = sm.sample(chains=1, data=data_dict, iter_sampling=5000, thin=5)

def moondist(z, Sigma, mu):
    """
    Density of the "moon-shaped" distribution at the point z = (x, y).

    The polar coordinates (r, phi) of z have a bivariate normal
    distribution with mean mu and covariance matrix Sigma. The factor
    1/r is the Jacobian of the transformation (x, y) -> (r, phi).
    """
    x, y = z
    radius = np.sqrt(np.dot(z,z))
    angle = np.arctan2(y, x)
    polar = np.array([radius, angle])
    polar_density = sts.multivariate_normal.pdf(polar, mean=mu, cov=Sigma)
    return polar_density / radius

def gradmoondist(z, Sigma, mu):
    """
    Gradient with respect to z = (x, y) of the logarithm of the
    moon-shaped density (see moondist).

    The log-density is (up to a constant)
    -1/2 (u - mu)^T Sigma^{-1} (u - mu) - log(r),
    with u = (r, phi) the polar coordinates of z.
    """
    x, y = z
    radius = np.sqrt(np.dot(z, z))
    angle = np.arctan2(y, x)
    polar = np.array([radius, angle])
    ## transposed Jacobian of (x, y) -> (r, phi): column 1 is grad r, column 2 is grad phi
    jac_T = np.array([
        [x/radius, -y/radius**2],
        [y/radius, x/radius**2],
    ])
    ## gradient of log(r)
    grad_log_r = np.array([x/radius**2, y/radius**2])
    ## Sigma^{-1} (u - mu)
    scaled_resid = np.linalg.solve(Sigma, polar-mu)
    return - np.dot(jac_T, scaled_resid) - grad_log_r

def can_sys(t, y, Sigma, mu):
    """
    Right-hand side of Hamilton's equations for the moon-shaped
    target density, with unit mass matrix.

    The state y consists of the position (first two elements) followed
    by the momentum (remaining elements). The time t is not used.
    """
    position, momentum = y[:2], y[2:]
    ## the force is the gradient of the log-density at the current position
    force = gradmoondist(position, Sigma, mu)
    return np.concatenate([momentum, force])

x0 = np.array([1, 0])

fig, axs = plt.subplots(1, 2, figsize=(10,5))

## left panel: Hamiltonian trajectories starting in x0

## the integration interval and the output grid are shared by all trajectories
t_span = (0, 2)
t_eval = np.linspace(*t_span, 1000)

axs[0].scatter(*x0, color='tab:green', zorder=4, label="$\\theta(0)$")
for i in range(10):
    ## draw a random momentum and follow the Hamiltonian flow
    p0 = sts.norm.rvs(0, 1, size=2)
    y0 = np.concatenate([x0, p0])
    sol = solve_ivp(lambda t,y: can_sys(t,y, Sigma, mu),
                    t_span, y0, t_eval=t_eval)

    traj_x, traj_y = sol.y[0], sol.y[1]
    axs[0].plot(traj_x, traj_y, color='k', zorder=2)
    ## only the first end point gets a legend entry
    if i == 0:
        lab = "$\\theta(T)$"
    else:
        lab = None
    axs[0].scatter(traj_x[-1], traj_y[-1], color='r', zorder=3, label=lab)

## log-density on a grid (rows correspond to y, columns to x)
xs = np.linspace(-2, 2, 100)
ys = np.linspace(-2, 2, 100)
Ls = [[moondist(np.array([x,y]), Sigma, mu) for x in xs] for y in ys]
lLs = np.log(Ls)

## shade the regions within 2 and within 1 log-units of the maximum
m = np.max(lLs)
for depth in (2, 1):
    axs[0].contourf(xs, ys, lLs, [m-depth,m], colors=['tab:blue'], alpha=0.3)

axs[0].legend()


## right panel: plot Stan sample

xs = sam.stan_variable("x")

axs[1].scatter(xs[:,0], xs[:,1], s=2, color='k', zorder=3)
axs[1].plot(xs[:,0], xs[:,1], zorder=2, color='r', linewidth=0.5)

## same window for both panels
k = 1.2
for ax in axs:
    ax.set_xlim(-k, k)
    ax.set_ylim(-k, k)


In [None]:
## fig still refers to the two-panel figure built in the previous code cell
fig ## show moon-shaped distribution example

NUTS: **no-U-turn sampler**. Avoid "going around in circles".