In [None]:
%%bash
# Only install baukit when running on Google Colab: `stat` probes for the
# google.colab package directory, and when that probe fails the negated
# test succeeds and `exit` ends the script before pip runs.
!(stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1) && exit
# Install baukit straight from GitHub; only stdout is discarded, so errors still show.
pip install git+https://github.com/davidbau/baukit > /dev/null

In [None]:
from baukit import show, Range, Numberbox, PlotWidget, set_requires_grad
import torch, numpy

# Generative Modeling: the Classical Density Approach

To make a generative **density** model, you start with a set of data points, like the following six 2D points $\{x_i\}$:

In [None]:
# Six 2D points stored column-wise in a (2, 6) tensor:
# row 0 holds the x coordinates and row 1 holds the y coordinates.
data = torch.tensor([[-0.2997,  0.7500,  2.4024, -3.3377, -1.3939, 1.2348],
                     [ 0.2965,  0.1307, -1.5569,  1.3849,  1.0405, 1.1438]])
def draw_data(fig, data, size=30):
    """Scatter the observed points in red on the figure's single axes.

    Args:
        fig: figure-like object whose ``axes`` holds exactly one axes.
        data: 2-row array/tensor; row 0 is plotted as x, row 1 as y.
        size: marker size forwarded to ``scatter`` as ``s``.
    """
    # Unpacking fails loudly if the figure has more (or fewer) than one axes.
    (ax,) = fig.axes
    xs, ys = data[0, :], data[1, :]
    ax.scatter(xs, ys, s=size, color='red')
    
# Wrap draw_data in a baukit PlotWidget, passing the six points as its `data`
# keyword; as the cell's last expression, the widget is what gets displayed.
PlotWidget(draw_data, data=data)

And then you define a family of **probability density** functions to fit the data.  Here we use a 2D multivariate Gaussian model $\mathcal{N}(\mu, \Sigma)$, which is defined as:

$$
P(x) = \frac{\exp\left(-\frac{1}{2}(x - \mu)^{T}\Sigma^{-1}(x - \mu)\right)}{\sqrt{(2 \pi)^{d} \det{\Sigma}}} 
$$

Despite the length of the formula, the multivariate Gaussian is very simple and is just an ellipsoid-shaped mound of probability density as below.

It is parameterized by a matrix $\Sigma$ and a vector $\mu$.  If you re-run the cell, different parameters will be chosen:

In [None]:
def draw_density(fig, Sigma, Mean):
    """Outline a Gaussian as four nested ellipses on the figure's single axes.

    A unit circle is pushed through the 2x2 matrix ``Sigma``, scaled by each
    radius, and shifted by ``Mean``; each resulting closed curve is drawn in
    sky blue.

    Args:
        fig: figure-like object whose ``axes`` holds exactly one axes.
        Sigma: 2x2 tensor applied to the unit circle as a linear map.
        Mean: length-2 tensor giving the centre of the ellipses.
    """
    (ax,) = fig.axes
    # 6.3 is just past 2*pi, so the sampled curve closes on itself.
    theta = torch.linspace(0, 6.3, 100)
    unit_circle = torch.stack([theta.sin(), theta.cos()])
    # Neither the mapped circle nor the offset depends on the radius.
    mapped = torch.mm(Sigma, unit_circle)
    offset = Mean[:, None]
    # The radii agree, to rounding, with sqrt(-2 ln(1 - p)) for
    # p = 0.2, 0.4, 0.6, 0.8 -- apparently the 20/40/60/80% mass contours of
    # a standard 2D normal.
    for radius in (0.668, 1.01, 1.35, 1.79):
        outline = mapped * radius + offset
        ax.plot(outline[0, :], outline[1, :], color='skyblue')
# Sigma and Mean are drawn with unseeded torch.randn, so every run of this
# cell shows a different family of ellipses.
PlotWidget(draw_density, Sigma=torch.randn(2, 2), Mean=torch.randn(2))

In [None]:
def draw_sample(fig, Sigma, Mean):
    """Scatter 500 points obtained by transforming fixed standard-normal noise.

    The noise comes from a NumPy ``RandomState`` seeded with 2, so the same
    500 base points are used on every call; each is mapped to
    ``Sigma @ z + Mean`` and drawn in slate blue.

    Args:
        fig: figure-like object whose ``axes`` holds exactly one axes.
        Sigma: 2x2 tensor applied to the noise as a linear map.
        Mean: length-2 tensor added to every transformed point.
    """
    (ax,) = fig.axes
    # Fixed seed: the cloud only changes when Sigma or Mean change.
    rng = numpy.random.RandomState(2)
    noise = torch.tensor(rng.randn(2, 500)).float()
    points = Sigma @ noise + Mean[:, None]
    xs, ys = points[0, :], points[1, :]
    ax.scatter(xs, ys, s=20, color='slateblue')
# Sigma and Mean are drawn with unseeded torch.randn, so every run of this
# cell shows a differently shaped and placed sample cloud.
PlotWidget(draw_sample, Sigma=torch.randn(2, 2), Mean=torch.randn(2))

Then to **train** our generative model, we find the parameters $\Sigma$ and $\mu$ that make the observed data most likely. In other words, the predicted probabilities of the observed data should be the highest.

Since each data item $x_i$ is independent, the probability of the whole data set is the product of the probabilities of each one.  We usually take the negative log of these so we can just add them up.  For our Gaussian model, this negative log likelihood (NLL) for $\{x_i\}$ is:
$$
- \log P[\{x_i\}] = - \log \prod_i P[x_i] = -\sum_i \log P[x_i] = \sum_i \left ( \frac{1}{2} (x_i - \mu)^T\Sigma^{-1}(x_i - \mu)  +  \frac{1}{2} \log \det{\Sigma} + \frac{d}{2} \log (2 \pi) \right )
$$

Again, seems like a lot of math, but it's just one line of code.  Below we define a function to compute the NLL over a bunch of data and graph it.

In [None]:
import math
def gaussian_nll(x, Sigma, Mean):
    """Per-point negative log likelihood under a multivariate Gaussian.

    For each column x_i of ``x`` this returns

        0.5 * (x_i - Mean)^T Sigma^{-1} (x_i - Mean)
        + 0.5 * log det(Sigma)
        + (d / 2) * log(2 * pi)

    which is -log of the density N(Mean, Sigma) evaluated at x_i.

    Args:
        x: (d, n) tensor holding n points as columns.
        Sigma: (d, d) covariance matrix. ``torch.logdet`` is only finite for
            a positive determinant, so other inputs give nan/-inf results.
        Mean: length-d mean vector.

    Returns:
        Length-n tensor with one NLL value per column of ``x``.
    """
    dim = x.shape[0]
    centered = x - Mean[:, None]
    # Squared Mahalanobis distance of every column, computed in one shot.
    mahalanobis_sq = (centered * (torch.inverse(Sigma) @ centered)).sum(dim=0)
    # The 1/2 on the quadratic term comes from the exponent of the Gaussian
    # density; the normalisation constant scales with the data dimension.
    return (0.5 * mahalanobis_sq
            + 0.5 * torch.logdet(Sigma)
            + 0.5 * dim * math.log(2 * math.pi))

def draw_negative_log_likelihood(fig, A=1.0, B=0.0, C=1.0, X=0.0, Y=0.0, title='Gaussian', sample=False):
    """Redraw the data, the Gaussian outline and per-point NLL labels.

    The Gaussian parameters are assembled from scalars as
    ``Sigma = [[A, B], [B, C]]`` and ``Mean = [X, Y]``.  The module-level
    ``data`` tensor supplies the points being scored.

    Args:
        fig: figure-like object whose ``axes`` holds exactly one axes.
        A, B, C: entries of the symmetric 2x2 matrix Sigma.
        X, Y: entries of the mean vector.
        title: accepted for interface compatibility; not used by the body.
        sample: when true, also scatter points drawn by ``draw_sample``.
    """
    (ax,) = fig.axes

    # Start from a clean, square canvas covering [-5, 5] on both axes.
    ax.clear()
    ax.set_aspect('equal')
    ax.set_xlim(-5, 5)
    ax.set_ylim(-5, 5)

    Sigma = torch.tensor([[A, B], [B, C]]).float()
    Mean = torch.tensor([X, Y])

    # Layering order: optional sample cloud, then the data, then the outline.
    if sample:
        draw_sample(fig, Sigma, Mean)
    draw_data(fig, data)
    draw_density(fig, Sigma, Mean)

    # Label every data point with its own NLL and put the total in the title.
    nll = gaussian_nll(data, Sigma, Mean)
    point_count = data.shape[1]
    for value, px, py in zip(nll.tolist(), data[0].tolist(), data[1].tolist()):
        ax.annotate(f'{value:.3f}', (px, py))
    total = nll.sum().item()
    ax.set_title(f'Negative log likelihood of {point_count} data points: {total:.3f}')

In the user interface below, we let you control $\Sigma$ and $\mu$ parameters by hand by adjusting sliders for $A$, $B$, $C$, $X$, and $Y$:

$$
\text{parameters } \theta \text{ are: } \qquad
\Sigma = \begin{bmatrix} A & B \\ B & C \end{bmatrix}
\qquad
\mu = \begin{bmatrix} X \\ Y \end{bmatrix}
$$

In [None]:
# Interactive 8x8 figure rendered by draw_negative_log_likelihood.
plot = PlotWidget(draw_negative_log_likelihood, figsize=(8,8))
# One control row per parameter name (A, B, C, X, Y): a right-aligned label,
# a slider over [-2.0, 6.0] in steps of 0.01, and a number box.  The slider
# and the box both receive plot.prop(v) -- presumably a two-way binding to the
# plot's keyword argument of that name (confirm against baukit).
# The final row holds the plot itself.
show([[show.style(textAlign='right'), v,
       show.style(flex=5), Range(value=plot.prop(v), min=-2.0, max=6.0, step=0.01),
       show.style(width=50), Numberbox(value=plot.prop(v))]
          for v in 'ABCXY'] +
     [[plot]])

Can you find a parameter setting that achieves NLL of less than 23?

So this is the standard **density model** setting: