# Learning a multimodal distribution

We use affine autoregressive transforms to learn the map from a standard normal to a two-moon distribution. The code is light and can be run on a local laptop; no GPUs needed.

- Transforms are based on `pyro`, which closely embraces the `torch.distributions` library.
- `scikit-learn` two-moon dataset is used.
- `matplotlib` for plotting

In [None]:
import matplotlib.pyplot as plt

from sklearn import datasets

import torch
from torch import optim
from torch.distributions import Normal, TransformedDistribution

from pyro.nn import AutoRegressiveNN
from pyro.distributions.transforms import AffineAutoregressive

In [None]:
# Draw 1000 noisy 2-D points from scikit-learn's two-moons generator.
# `labels` is the second return value; it is used further down to colour the
# points and as the conditioning variable.
samples, labels = datasets.make_moons(n_samples=1000, noise=0.1)

In [None]:
# Scatter the raw samples: first coordinate on the x-axis, second on the y-axis.
plt.scatter(samples.T[0], samples.T[1], color="orange")
plt.title("Two moon distribution")
plt.xlabel("$x$")
plt.ylabel("$y$")

In [None]:
# Convert the samples to a float32 tensor for use with the flow.
# `torch.as_tensor` accepts both the NumPy array produced by `make_moons` and
# an already-converted tensor, so re-running this cell does not raise
# (unlike `torch.from_numpy`, which only accepts NumPy arrays).
samples = torch.as_tensor(samples, dtype=torch.float32)

## Autoregressive Net and Transform

The flow we implement below has affine autoregressive transforms. Most of the constructs are available in the `pyro` API.

In [None]:
input_dim = 2  # data dimension
# three hidden layers, each 50x wider than the input
hidden_dims = [50*input_dim, 50*input_dim, 50*input_dim]

# base distribution of the flow: standard normal in each of the input_dim coordinates
base_dist = Normal(torch.zeros(input_dim), torch.ones(input_dim))

# autoregressive network; param_dims=[1, 1] requests two outputs of size 1 per input dimension
arn = AutoRegressiveNN(input_dim, hidden_dims, param_dims=[1, 1])

In [None]:
# Bare expression: the notebook shows the network's repr as the cell output.
arn

In [None]:
# Two-dimensional input -> two outputs (one per entry of param_dims).
# NOTE(review): the affine transform below presumably consumes these as the
# mean and the log-scale rather than sigma itself -- confirm in the pyro docs.
arn(torch.ones(1, 2))

In [None]:
# "affine": each output coordinate is a shift-and-scale of the matching input
# coordinate, y_i = mu_i + sigma_i * x_i, with mu_i and sigma_i produced by `arn`
# (see the pyro AffineAutoregressive documentation).
transform =  AffineAutoregressive(arn)

In [None]:
# The flow is a torch TransformedDistribution: the base distribution pushed
# through the list of transforms (here a single one).
flow_dist = TransformedDistribution(base_dist, [transform])

The `flow_dist` is the normalizing flow: it is a distribution whose density can be evaluated and which can be sampled from.

In [None]:
# Draw from the (still untrained) flow; the argument is the sample shape.
flow_dist.sample([10]) # -> 10 samples

In [None]:
# Evaluate the flow's log-density at the four corners of the unit square.
sample_points = torch.tensor(
    [
        [0., 0.],
        [0., 1.],
        [1., 0.],
        [1., 1.],
    ]
)
# Pure evaluation: no gradients are needed here.
with torch.no_grad():
    sample_log_prob = flow_dist.log_prob(sample_points)
# Print one log-density per probe point.
for sample, log_prob in zip(sample_points, sample_log_prob):
    print(f"log p({sample}) = {log_prob:.3e}")

# Now we train the flow

In [None]:
# Only the transform's parameters are optimised; the base distribution was
# built from constant zeros/ones tensors.
optimizer = optim.Adam(transform.parameters(), lr=1e-3)

In [None]:
from IPython.display import clear_output
from time import sleep

def live_plot(x_vals, y_vals, iteration, labels=None):
    """Replace the cell output with a fresh scatter of flow vs. training samples.

    The current notebook output is cleared, execution pauses for one second,
    and a new 6x6 figure is drawn with two scatter layers: the supplied points
    (legend entry 'proxy') and the module-level ``samples`` (legend entry
    'Orig.', alpha 0.1).

    Args:
        x_vals: x-coordinates of the points drawn as 'proxy'.
        y_vals: y-coordinates of the points drawn as 'proxy'.
        iteration: value inserted into the figure title.
        labels: optional colour values forwarded as ``c`` to the 'Orig.'
            scatter; ``None`` leaves the colour to matplotlib.
    """
    clear_output(wait=True)
    sleep(1)

    _, axes = plt.subplots(figsize=(6, 6))
    axes.scatter(x_vals, y_vals, label='proxy')

    # Training data comes from the notebook-level `samples`, not an argument.
    orig_x, orig_y = samples[:, 0], samples[:, 1]
    axes.scatter(orig_x, orig_y, c=labels, alpha=0.1, label='Orig.')

    axes.legend()
    axes.set_title('iteration {}'.format(iteration))

## Learn the Transform

In [None]:
# Maximum-likelihood training of the single-transform flow on the full dataset.
num_iter = 1000
for i in range(num_iter):

    optimizer.zero_grad()
    # take the original samples, and evaluate the likelihood.
    # The loss is the mean negative log-density of the data under the flow.
    loss = -flow_dist.log_prob(samples).mean()
    loss.backward()
    optimizer.step()

    flow_dist.clear_cache()  # pyro modules cache values and derivatives for performance

    # Every 100 iterations, draw 1000 samples from the flow and plot them
    # against the training data.
    if (i + 1) % 100 == 0:
        with torch.no_grad():
            samples_flow = flow_dist.sample(torch.Size([1000,])).numpy()
        live_plot(samples_flow[:,0], samples_flow[:,1], i + 1)
        plt.show()

## Compose several transforms

In the previous case we had just a single transform. Now we compose several of them and repeat the training.

In [None]:
# Five affine autoregressive layers, each wrapping its own freshly constructed
# AutoRegressiveNN (no network is shared between layers).
transforms = [
    AffineAutoregressive(
        AutoRegressiveNN(
            input_dim, hidden_dims,
            param_dims=[1, 1]
        )
    ) for _ in range(5)
]

In [None]:
# Rebind `flow_dist` to the flow built from the whole list of transforms.
flow_dist = TransformedDistribution(base_dist, transforms)

In [None]:
# Flatten the parameters of every transform in the stack into one list so a
# single optimizer can update all layers.
trainable_parameters = [
    param
    for layer in transforms
    for param in layer.parameters()
]

In [None]:
# Fresh Adam optimizer over the parameters collected in `trainable_parameters`.
optimizer = optim.Adam(trainable_parameters, lr=1e-3)

### Learn the transform

In [None]:
# Maximum-likelihood training of the composed flow on the full dataset.
num_iter = 5000
for i in range(num_iter):

    optimizer.zero_grad()
    # mean negative log-density of the data under the current flow
    loss = -flow_dist.log_prob(samples).mean()
    loss.backward()
    optimizer.step()
    # drop cached values so the next iteration uses the updated parameters
    flow_dist.clear_cache()

    # Every 100 iterations, draw 1000 samples from the flow and plot them
    # against the training data.
    if (i + 1) % 100 == 0:
        with torch.no_grad():
            samples_flow = flow_dist.sample(torch.Size([1000,])).numpy()

        live_plot(samples_flow[:,0], samples_flow[:,1], i + 1)
        # fixed axis limits keep successive frames comparable
        plt.xlim((-2.0, 3.0))
        plt.ylim((-1.5, 1.5))
        plt.show()

In [None]:
def num_parameters(parameters):
    """Return the total number of trainable scalar parameters.

    Args:
        parameters: iterable of tensors, e.g. the result of
            ``module.parameters()`` or a plain list of parameters.

    Returns:
        The sum of ``numel()`` over the tensors whose ``requires_grad`` is
        true; ``0`` for an empty iterable.
    """
    return sum(p.numel() for p in parameters if p.requires_grad)

In [None]:
# Parameter count of the single-transform flow defined earlier in the notebook.
print("Trainable parameters of single transform =", num_parameters(transform.parameters()))

In [None]:
# Parameter count of the list collected in `trainable_parameters`
# (message wording fixed: the original read "of after").
print("Trainable parameters after composing transforms =", num_parameters(trainable_parameters))

# Things to try

- Compare results/number of trainable parameters from other flavors of autoregressive nets: splines, neural autoregressive etc.
- Compare results/number of parameters with coupling layers instead. Note that, as with affine/spline autoregressive transforms, there are corresponding affine/spline coupling transforms.
- Depending on whether the masked feed-forward layers are implemented from "data" to "normal" direction or opposite, the flow is called masked-autoregressive or inverse-autoregressive. Look at the `pyro` source code on github and infer which one is the above implementation.

# Sampling from each mode



In [None]:
# Same scatter as before, now coloured by the label returned by `make_moons`.
plt.scatter(samples.T[0], samples.T[1], c=labels)
plt.title("Two colored moon")
plt.xlabel("$x$")
plt.ylabel("$y$")
# colour scale for the label values
plt.colorbar()

In [None]:
# Convert the labels to a float32 column tensor with one row per sample, the
# layout used as conditioning context below.
# `torch.as_tensor` accepts both the NumPy array from `make_moons` and an
# already-converted tensor, so re-running this cell does not raise
# (unlike `torch.from_numpy`, which only accepts NumPy arrays).
labels = torch.as_tensor(labels, dtype=torch.float32).reshape(samples.shape[0], 1)

Let's learn the conditional approximator based on the color

In [None]:
from pyro.distributions import ConditionalTransformedDistribution
from pyro.nn.auto_reg_nn import ConditionalAutoRegressiveNN
from pyro.distributions.transforms import ConditionalAffineAutoregressive

In [None]:
condition_dim = 1 # the color is either 0 or 1
# Conditional variant of the autoregressive network: same hidden sizes and
# outputs as before, plus a context input of size `condition_dim`.
# Note that this rebinds the name `arn`.
arn = ConditionalAutoRegressiveNN(input_dim, condition_dim, hidden_dims, param_dims=[1, 1])

In [None]:
# Sanity check: a batch of 5 two-dimensional inputs with a matching batch of
# 5 one-dimensional context values.
arn(torch.ones(5, 2), context=torch.ones(5, 1))  # need to supply additional context

In [None]:
# Five conditional affine autoregressive layers, each wrapping its own freshly
# constructed ConditionalAutoRegressiveNN. This rebinds `transforms`.
transforms = [
    ConditionalAffineAutoregressive(
        ConditionalAutoRegressiveNN(
            input_dim, condition_dim, hidden_dims,
            param_dims=[1, 1]
        )
    ) for _ in range(5)
]
# Conditional flow: call `.condition(context)` on it before `log_prob` / `sample`
# (as done in the training loop below).
conditional_flow_dist = ConditionalTransformedDistribution(base_dist, transforms)

In [None]:
# Flatten the parameters of every conditional transform into one list so a
# single optimizer can update all layers.
trainable_parameters = [
    param
    for layer in transforms
    for param in layer.parameters()
]

In [None]:
# Maximum-likelihood training of the conditional flow: each sample is scored
# under the flow conditioned on that sample's own label.
optimizer = optim.Adam(trainable_parameters, lr=1e-3)
num_iter = 1000

for i in range(num_iter):
    optimizer.zero_grad()
    loss = -conditional_flow_dist.condition(labels).log_prob(samples).mean()
    loss.backward()
    optimizer.step()
    conditional_flow_dist.clear_cache()

    if (i + 1) % 100 == 0:
        inference_label = ((i + 1) // 100) % 2  # alternate between modes
        # Condition on the alternating label. Previously this value was
        # computed but ignored and the context was hard-coded to 1, so only
        # one moon was ever sampled. Use float32 to match the training labels.
        context = torch.tensor([float(inference_label)])
        with torch.no_grad():
            samples_mode = conditional_flow_dist.condition(
                context
            ).sample(torch.Size([1000,])).numpy()

        live_plot(samples_mode[:,0], samples_mode[:,1], i + 1, labels=labels)
        # fixed axis limits keep successive frames comparable
        plt.xlim((-2.0, 3.0))
        plt.ylim((-1.5, 1.5))
        plt.show()