# Posterior inference

In this notebook we will address “learning” in the Bayesian context, which ends meaning updating the distributions of the parameters of a model, by conditioning on observed data.

## Coin fairness

In [None]:
from math import sqrt
import os
import torch
import torch.distributions.constraints as constraints
import matplotlib.pyplot as plt
import pyro
from pyro.optim import Adam
from pyro.infer import SVI, Trace_ELBO
import pyro.distributions as dist
from pyro.infer import Predictive, MCMC, NUTS
import csv
import numpy as np
from src.graphs import coin_graph_multi, coin_graph_plate, coin_graph


Let's return to and extend [the coin-tossing example from the pyro tutorial](http://pyro.ai/examples/svi_part_i.html).
Our goal is to infer the fairness of the coin.

$$\begin{aligned}
\operatorname{fairness} &\sim \operatorname{Beta}(10,10)\\
\operatorname{toss} &\sim \operatorname{Binom}(\operatorname{fairness}).
\end{aligned}$$

In [None]:
pyro.clear_param_store()

# Hyperparameters of the Beta prior placed on the coin's fairness.
alpha0 = 10.0
beta0 = 10.0


def model(toss):
    """Single-toss coin model.

    Draws the latent fairness from Beta(alpha0, beta0) and conditions a
    Bernoulli(fairness) site named "toss" on the observed value ``toss``.
    """
    prior = dist.Beta(torch.tensor(alpha0), torch.tensor(beta0))
    fairness = pyro.sample("latent_fairness", prior)
    # the observed toss is tied to the latent fairness through the likelihood
    pyro.sample("toss", dist.Bernoulli(fairness), obs=toss)


We can diagram the dependency structure of that model thusly:

In [None]:
# Show the dependency diagram for the single-toss model (helper imported from src.graphs).
coin_graph()

Let's say that I toss the coin one time and it comes up "heads", which we code as $\operatorname{toss}=1.$
Let us use pyro to condition on that single observation to update the distribution of the fairness.
For this purpose we will simulate from the distribution $\operatorname{fairness}|\operatorname{toss}=1$:


In [None]:
# observation: a single toss, coded as 1
toss = torch.tensor([1.0])
pyro.set_rng_seed(101)

# sampler setup: NUTS kernel, 200 warm-up steps, then 2000 retained samples
nuts_kernel = NUTS(model, jit_compile=True)
mcmc = MCMC(nuts_kernel, num_samples=2000, warmup_steps=200)
mcmc.run(toss)

# Plot the prior density against a histogram of the posterior samples
prior_fairness = dist.Beta(alpha0, beta0)
fairness_samples_1 = mcmc.get_samples()['latent_fairness'].cpu().numpy()
plt.hist(fairness_samples_1, bins=np.linspace(0, 1, 33), density=True, color="red", label="1 observation posterior", alpha=0.2)
measurements = torch.linspace(0, 1, 1000)
plt.plot(measurements, prior_fairness.log_prob(measurements).exp(), color="blue", label="prior")
# the label= arguments above are only drawn once a legend is requested
plt.legend()
plt.show()

I really hope the (red) posterior distribution came out pretty close to the (blue) prior distribution.

Now, let us suppose that we flipped the coin 5 times and got heads, heads, tails, heads, tails. what does inference for that model look like?
First we need to update the model to account for multiple observations,

$$\begin{aligned}
\operatorname{fairness} &\sim \operatorname{Beta}(10,10)\\
\operatorname{toss_i} &\sim \operatorname{Binom}(\operatorname{fairness}), i=1,\dots,5.
\end{aligned}$$

In code, we can write this as:

In [None]:
pyro.clear_param_store()

# Hyperparameters of the Beta prior placed on the coin's fairness.
alpha0 = 10.0
beta0 = 10.0


def model(tosses):
    """Coin model for several tosses sharing one latent fairness.

    The fairness is drawn once from Beta(alpha0, beta0); every element of
    ``tosses`` is then observed at its own site ``toss_<i>`` under a
    Bernoulli(fairness) likelihood.
    """
    prior = dist.Beta(torch.tensor(alpha0), torch.tensor(beta0))
    fairness = pyro.sample("latent_fairness", prior)
    # one observed site per datapoint, named by its position in `tosses`
    for index, outcome in enumerate(tosses):
        pyro.sample(f"toss_{index}", dist.Bernoulli(fairness), obs=outcome)


What does the dependency graph of this look like?

In [None]:
# Show the dependency diagram for the model with 5 tosses (helper imported from src.graphs).
coin_graph_multi(5)

In [None]:
# observations: heads, heads, tails, heads, tails (heads coded as 1)
tosses = torch.tensor([1, 1, 0, 1, 0], dtype=torch.float)
pyro.set_rng_seed(102)

# sampler setup: NUTS kernel, 200 warm-up steps, then 2000 retained samples
nuts_kernel = NUTS(model, jit_compile=True)
mcmc = MCMC(nuts_kernel, num_samples=2000, warmup_steps=200)
mcmc.run(tosses)

# Plot the prior density against histograms of the posterior samples.
# fairness_samples_1 comes from the earlier single-observation run.
prior_fairness = dist.Beta(alpha0, beta0)
fairness_samples_5 = mcmc.get_samples()['latent_fairness'].cpu().numpy()
bins = np.linspace(0, 1, 33)  # shared bin edges so the histograms are comparable
plt.hist(fairness_samples_1, bins=bins, density=True, color="red", label="1 observation posterior", alpha=0.2)
plt.hist(fairness_samples_5, bins=bins, density=True, color="red", alpha=0.4, label="5 observation posterior")
measurements = torch.linspace(0, 1, 1000)
plt.plot(measurements, prior_fairness.log_prob(measurements).exp(), color="blue", label="prior")
# the label= arguments above are only drawn once a legend is requested
plt.legend()
plt.show()

We have in fact tossed the coin 1000 times in our new Google-funded robotic coin tossing laboratory, and recorded the data in a CSV file. Lets load that up and print the first few observations:

In [None]:
# Read the recorded tosses (first column of the CSV) into a float tensor.
tosses = []
with open('coin_tosses.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # the first row is a header, not data
    tosses.extend(int(row[0]) for row in reader)
tosses = torch.tensor(tosses, dtype=torch.float)
print(tosses[:10])


At a thousand observations it starts to feel like using that plate notation might be nice.

In [None]:
pyro.clear_param_store()


def model(tosses):
    """Coin model with the observations vectorised through a plate.

    Draws the latent fairness from a Beta(10, 10) prior and conditions a
    single Bernoulli(fairness) site, declared inside a plate whose size is
    ``tosses.shape[0]``, on the whole ``tosses`` tensor at once.

    Args:
        tosses: tensor of observed outcomes; its first dimension sets the
            plate size.
    """
    # hyperparameters that control the beta prior
    alpha0 = 10.0
    beta0 = 10.0

    # sample f from the beta prior
    f = pyro.sample("latent_fairness", dist.Beta(torch.tensor(alpha0), torch.tensor(beta0)))
    # The plate declares the observations conditionally independent given f.
    with pyro.plate("data", tosses.shape[0]):
        # Condition on the function argument `tosses`; the previous `obs=toss`
        # referred to a name that is not defined in this function.
        pyro.sample("tosses", dist.Bernoulli(f), obs=tosses)


In [None]:
# Show the plate-notation dependency diagram for 1000 tosses (helper imported from src.graphs).
coin_graph_plate(1000)

In [None]:
pyro.set_rng_seed(103)

# sampler setup: NUTS kernel, 200 warm-up steps, then 2000 retained samples
nuts_kernel = NUTS(model, jit_compile=True)
mcmc = MCMC(nuts_kernel, num_samples=2000, warmup_steps=200)
mcmc.run(tosses)

# Plot the prior density against histograms of all three posterior sample sets.
# fairness_samples_1 and fairness_samples_5 come from the earlier runs.
prior_fairness = dist.Beta(alpha0, beta0)
fairness_samples_1000 = mcmc.get_samples()['latent_fairness'].cpu().numpy()
bins = np.linspace(0, 1, 33)  # shared bin edges so the histograms are comparable
for samples, opacity, label in (
    (fairness_samples_1, 0.2, "1 observation posterior"),
    (fairness_samples_5, 0.4, "5 observation posterior"),
    (fairness_samples_1000, 0.8, "1000 observation posterior"),
):
    plt.hist(samples, bins=bins, density=True, color="red", alpha=opacity, label=label)
measurements = torch.linspace(0, 1, 1000)
plt.plot(measurements, prior_fairness.log_prob(measurements).exp(), color="blue", label="prior")
# the label= arguments above are only drawn once a legend is requested
plt.legend()
plt.show()

What is this telling us about the distribution of the fairness parameter?

In [None]:
# Summarise the posterior samples from the 1000-observation run.
# (`latent_fairness_samples` was never defined; the samples live in `fairness_samples_1000`.)
print(f"We now believe the fairness is {fairness_samples_1000.mean():.3f} ± {fairness_samples_1000.std():.3f}")

* http://pyro.ai/examples/bayesian_regression.html
* http://pyro.ai/examples/bayesian_regression_ii.html
* http://pyro.ai/examples/intro_part_i.html
* http://pyro.ai/examples/intro_part_ii.html
* http://pyro.ai/examples/effect_handlers.html
* http://pyro.ai/examples/sir_hmc.html
* https://docs.pyro.ai/en/1.7.0/poutine.html
* http://pyro.ai/examples/mle_map.html

## An actual regression problem

## archived SVI stuff


In [None]:

# NOTE(review): this archived cell uses `sns`, `sites`, `svi_samples` and
# `hmc_samples`, none of which are imported or defined in the visible part of
# this notebook — confirm where they come from before re-enabling it.
# Overlay the SVI and HMC samples for each site on a 2x2 grid of axes.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.suptitle("Marginal Posterior density - Regression Coefficients", fontsize=16)
# one subplot per site: the i-th flattened axis shows sites[i]
for i, ax in enumerate(axs.reshape(-1)):
    site = sites[i]
    sns.distplot(svi_samples[site], ax=ax, label="SVI (DiagNormal)")
    sns.distplot(hmc_samples[site], ax=ax, label="HMC")
    ax.set_title(site)
# `ax` is the last axis from the loop; its handles/labels feed one figure-level legend
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right');

In [None]:

# how much of our data to use (number of tosses)
n_tosses = 1

# clear the param store in case we're in a REPL
pyro.clear_param_store()



def guide(tosses):
    """Variational guide: a Beta distribution over "latent_fairness".

    Registers two learnable parameters, "alpha_q" and "beta_q", each starting
    at 15.0 and constrained to be positive, and samples "latent_fairness" from
    Beta(alpha_q, beta_q). ``tosses`` is accepted but not read here.
    """
    # constraints.positive keeps both concentrations valid for a Beta while
    # the optimizer works on their unconstrained counterparts
    concentrations = [
        pyro.param(name, torch.tensor(15.0), constraint=constraints.positive)
        for name in ("alpha_q", "beta_q")
    ]
    pyro.sample("latent_fairness", dist.Beta(*concentrations))

    
def basic_coin_posterior(model, guide, tosses, *, n_steps=200, lr=0.005):
    """Fit ``guide`` to ``model`` with stochastic variational inference.

    Runs ``n_steps`` Adam updates against the Trace_ELBO loss. Parameters the
    guide registers through ``pyro.param`` can be read back afterwards with
    ``pyro.param(name)``.

    Args:
        model: Pyro model callable; called with ``tosses``.
        guide: Pyro guide callable; called with ``tosses``.
        tosses: observations passed unchanged to every ``svi.step`` call.
        n_steps: number of gradient steps (default 200, the former fixed value).
        lr: Adam learning rate (default 0.005, the former fixed value).

    Returns:
        list: the value returned by each ``svi.step`` call, in step order.
    """
    # setup the optimizer
    optimizer = Adam({"lr": lr, "betas": (0.90, 0.999)})

    # setup the inference algorithm
    svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

    # do gradient steps, keeping the per-step loss so convergence can be inspected
    losses = []
    for step in range(n_steps):
        losses.append(svi.step(tosses))
        # lightweight progress indicator: one dot every 100 steps
        if step % 100 == 0:
            print('.', end='')
    return losses


# Run the SVI fit before reading the parameters: the param store was cleared
# above, so "alpha_q"/"beta_q" only exist once the guide has been executed.
basic_coin_posterior(model, guide, tosses[:n_tosses])

# grab the learned variational parameters
alpha_q = pyro.param("alpha_q").item()
beta_q = pyro.param("beta_q").item()

prior_fairness = dist.Beta(alpha0, beta0)
posterior_fairness = dist.Beta(alpha_q, beta_q)
inferred_mean = posterior_fairness.mean.item()
inferred_std = sqrt(posterior_fairness.variance)

print("\nbased on the data and our prior belief, the fairness " +
      "of the coin is %.3f +- %.3f" % (inferred_mean, inferred_std))

# evaluate both densities on an evenly spaced grid over [lb, ub]
lb = 0
ub = 1
resolution = 1000
measurements = torch.linspace(lb, ub, resolution)

plt.plot(measurements, posterior_fairness.log_prob(measurements).exp(), color="blue", label="posterior")
plt.plot(measurements, prior_fairness.log_prob(measurements).exp(), color="red", label="prior")
# the label= arguments above are only drawn once a legend is requested
plt.legend()
plt.show()
