In [1]:
%set_env CUDA_VISIBLE_DEVICES=1

from gpytorch.models.pyro_deep_gp import AbstractPyroHiddenGPLayer, AbstractPyroDeepGP
from gpytorch.variational import VariationalStrategy, CholeskyVariationalDistribution
from gpytorch.kernels import MaternKernel, ScaleKernel, RBFKernel
from gpytorch.means import ConstantMean
from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import GaussianLikelihood
import math

import torch
import numpy as np

from scipy.cluster.vq import kmeans2

from pyro.infer import SVI, TraceMeanField_ELBO, Trace_ELBO
from pyro import optim
import pyro

from torch.utils.data import TensorDataset, DataLoader


env: CUDA_VISIBLE_DEVICES=1


In [2]:

import bayesian_benchmarks
from bayesian_benchmarks.data import get_regression_data



class ToyHiddenGPLayer(AbstractPyroHiddenGPLayer):
    """Hidden deep-GP layer: a batch of ``output_dims`` variational GPs.

    The layer uses a Cholesky variational distribution with learnable
    inducing locations, a constant mean and a scaled ARD Matern-5/2 kernel,
    each batched over ``output_dims``.
    """

    def __init__(self, input_dims, output_dims, name="", inducing_points=50):
        """Build the variational strategy, then the mean and kernel modules.

        Args:
            input_dims: Dimensionality of the layer input (also the number of
                ARD lengthscales of the kernel).
            output_dims: Number of GPs in the batch.
            name: Forwarded unchanged to the parent constructor.
            inducing_points: Either an ``int`` (that many inducing points per
                output are drawn from a standard normal, giving a tensor of
                shape ``(output_dims, inducing_points, input_dims)``) or a
                tensor of inducing locations whose second-to-last dimension
                is the number of inducing points.
        """
        # isinstance (rather than an exact type comparison) is the idiomatic
        # check and also accepts int subclasses.
        if isinstance(inducing_points, int):
            inducing_points = torch.randn(output_dims, inducing_points, input_dims)

        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=inducing_points.size(-2),
            batch_size=output_dims
        )

        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True
        )

        super().__init__(variational_strategy, input_dims, output_dims, name)

        # One set of mean/kernel hyperparameters per output GP.
        batch_shape = torch.Size([output_dims])

        self.mean_module = ConstantMean(batch_shape=batch_shape)
        self.covar_module = ScaleKernel(
            MaternKernel(nu=2.5, batch_shape=batch_shape, ard_num_dims=input_dims),
            batch_shape=batch_shape
        )

    def forward(self, x):
        """Return the GP prior at ``x`` as a ``MultivariateNormal``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return MultivariateNormal(mean_x, covar_x)


In [3]:

#TODO: Double inheritance
class ToyDeepGP(AbstractPyroDeepGP):
    """Output layer of the deep GP, stacked on top of ``hidden_gp_layers``.

    Mirrors ``ToyHiddenGPLayer``: a Cholesky variational distribution with
    learnable inducing locations, a constant mean and a scaled ARD Matern-5/2
    kernel, each batched over ``output_dims``.
    """

    def __init__(self, input_dims, output_dims, total_num_data, hidden_gp_layers, likelihood, name="", inducing_points=50):
        """Build the variational strategy, then the mean and kernel modules.

        Args:
            input_dims: Dimensionality of this layer's input (also the number
                of ARD lengthscales of the kernel).
            output_dims: Number of GPs in the batch.
            total_num_data: Forwarded unchanged to the parent constructor
                (the caller passes the number of training rows).
            hidden_gp_layers: Forwarded unchanged to the parent constructor
                (the caller passes a list of hidden layers).
            likelihood: Forwarded unchanged to the parent constructor.
            name: Forwarded unchanged to the parent constructor.
            inducing_points: Either an ``int`` (that many inducing points per
                output are drawn from a standard normal, giving a tensor of
                shape ``(output_dims, inducing_points, input_dims)``) or a
                tensor of inducing locations whose second-to-last dimension
                is the number of inducing points.
        """
        # Accept explicit inducing locations as well as a count, consistent
        # with ToyHiddenGPLayer; the int path is unchanged.
        if isinstance(inducing_points, int):
            inducing_points = torch.randn(output_dims, inducing_points, input_dims)

        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=inducing_points.size(-2),
            batch_size=output_dims
        )

        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True
        )

        super().__init__(
            variational_strategy,
            input_dims,
            output_dims,
            total_num_data,
            hidden_gp_layers,
            likelihood,
            name
        )

        # One set of mean/kernel hyperparameters per output GP.
        batch_shape = torch.Size([output_dims])

        self.mean_module = ConstantMean(batch_shape=batch_shape)
        self.covar_module = ScaleKernel(
            MaternKernel(nu=2.5, batch_shape=batch_shape, ard_num_dims=input_dims),
            batch_shape=batch_shape
        )

    def forward(self, x):
        """Return the GP prior at ``x`` as a ``MultivariateNormal``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return MultivariateNormal(mean_x, covar_x)

In [4]:

# --- Load the regression benchmark and move it to the GPU ---
dataset = get_regression_data('wilson_elevators')
N_train, N_test = dataset.X_train.shape[0], dataset.X_test.shape[0]
D_X = dataset.D + 1  # NOTE(review): not referenced by any visible cell

train_x = torch.tensor(dataset.X_train).float().cuda()
train_y = torch.tensor(dataset.Y_train[:, 0]).float().cuda()  # first target column only
test_x = torch.tensor(dataset.X_test).float().cuda()
test_y = torch.tensor(dataset.Y_test[:, 0]).float().cuda()
print("N_train = %d   N_test = %d" % (N_train, N_test))

# Seed both pyro and torch before any random draw below.
pyro.set_rng_seed(0)
torch.manual_seed(0)

hidden_layer_width = 2
num_inducing = 50

# --- Inducing points: k-means centroids seeded from random training rows ---
seed_rows = torch.randperm(N_train)[:num_inducing]
initial_centroids = train_x[seed_rows, :].clone().detach().cpu().numpy()
centroids = torch.tensor(
    kmeans2(train_x.detach().cpu().numpy(), initial_centroids, minit='matrix')[0]
)
# Every hidden output starts from the same centroids: add a leading
# dimension of size hidden_layer_width.
inducing_points = centroids.unsqueeze(0).expand((hidden_layer_width,) + centroids.shape)
inducing_points = inducing_points.to(device=train_x.device, dtype=train_x.dtype)

print(train_x.device, test_x.device)

N_train = 14939   N_test = 1660
cuda:0 cuda:0


In [5]:
# Device/dtype shared by every module and freshly drawn tensor in this cell.
tensor_kwargs = {"device": train_x.device, "dtype": train_x.dtype}

likelihood = GaussianLikelihood().cuda()

# --- Two-layer deep GP: one hidden layer feeding a single-output layer ---
hidden_gp = ToyHiddenGPLayer(
    train_x.size(-1),
    hidden_layer_width,
    name="layer1",
    inducing_points=inducing_points,
)
hidden_gp = hidden_gp.to(**tensor_kwargs)

deep_gp = ToyDeepGP(
    hidden_layer_width,
    1,
    train_x.size(-2),
    [hidden_gp],
    likelihood,
    name="output_layer",
    inducing_points=num_inducing,
)
deep_gp = deep_gp.to(**tensor_kwargs)

# Re-draw each variational mean as small Gaussian noise (std 0.2).
# Hidden layer first, then output layer, so the RNG stream is consumed in order.
for gp_layer in (hidden_gp, deep_gp):
    variational_mean = gp_layer.variational_strategy.variational_distribution.variational_mean
    variational_mean.data = 0.2 * torch.randn(variational_mean.shape, **tensor_kwargs)

# --- Mini-batch loader and optimizer ---
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=80, shuffle=True)

optimizer = optim.Adam({"lr": 0.03, "betas": (0.90, 0.999)})

# Start with a down-weighted annealing factor; the training loop resets it to 1.0.
deep_gp.annealing = 0.1
hidden_gp.annealing = 0.1

USE_NF = True

# different settings for u/f sampling versus f sampling (u marginalized out)
deep_gp.EXACT = False
hidden_gp.EXACT = False

if deep_gp.EXACT:
    num_particles, annealing_epoch, n_epochs = 4, 0, 300
else:
    num_particles, annealing_epoch, n_epochs = 32, 100, 500
use_nf_epoch = 0 if USE_NF else 999999

# --- Stochastic variational inference over the model/guide pair ---
elbo = TraceMeanField_ELBO(
    num_particles=num_particles,
    vectorize_particles=True,
    max_plate_nesting=1,
)
svi = SVI(deep_gp.model, deep_gp.guide, optimizer, elbo)

def ll_rmse(x, y, num_samples=50):
    """Estimate predictive log-likelihood and RMSE of ``deep_gp`` on ``(x, y)``.

    Evaluation runs under ``torch.no_grad()`` so that scoring the full
    train/test sets does not build an autograd graph through the model.

    Args:
        x: Inputs passed to the module-level ``deep_gp``.
        y: Targets; must broadcast against the per-sample predictions, whose
            leading dimension indexes samples (assumed shape ``(N,)`` —
            TODO confirm against callers).
        num_samples: Number of predictive samples requested from ``deep_gp``.

    Returns:
        Tuple ``(log_prob, rmse)``: ``log_prob`` is a 0-dim tensor holding the
        mean over data points of the log of the sample-averaged Gaussian
        density; ``rmse`` is a Python float computed from the sample-mean
        prediction.
    """
    with torch.no_grad():
        # Keep output channel 0 only; result is (num_samples, N).
        pred = deep_gp(x, num_samples=num_samples)[:, :, 0]
        # log_beta is treated as a log-precision, so std = exp(-0.5 * log_beta).
        noise_scale = (-0.5 * deep_gp.log_beta).exp()
        log_prob = torch.distributions.Normal(pred, noise_scale).log_prob(y)
        # log-mean-exp over samples, then average over data points.
        log_prob = torch.logsumexp(log_prob - math.log(num_samples), dim=0).mean()
        rmse = (pred.mean(0) - y).pow(2.0).mean().sqrt().item()
    return log_prob, rmse

In [6]:
print("Beginning training in EXACT=%s mode with %d particles" % (deep_gp.EXACT, num_particles))

num_batches = len(train_loader)
final_epoch = n_epochs - 1

for epoch_i in range(n_epochs):
    # Restore the full annealing weight once the annealing phase is over.
    if epoch_i == annealing_epoch:
        for gp_layer in (deep_gp, hidden_gp):
            gp_layer.annealing = 1.0
        if epoch_i > 0:
            print("Turning off KL annealing...")

    # Flip the use_nf flag on both layers at the configured epoch.
    if epoch_i == use_nf_epoch:
        for gp_layer in (hidden_gp, deep_gp):
            gp_layer.use_nf = True

    # One pass over the data; epoch_loss is the mean per-batch SVI loss.
    epoch_loss = 0
    for x_batch, y_batch in train_loader:
        epoch_loss += svi.step(x_batch, y_batch) / num_batches

    # Report metrics every fifth epoch and on the last one.
    is_report_epoch = (epoch_i % 5 == 0) or (epoch_i == final_epoch)
    if is_report_epoch:
        train_ll, train_rmse = ll_rmse(train_x, train_y)
        test_ll, test_rmse = ll_rmse(test_x, test_y)
        precision = pyro.param('log_beta').exp().item()
        frmt = "[epoch %03d] loss: %.4f  test_ll: %.3f  train_ll: %.3f  test_rmse: %.3f  train_rmse: %.3f  obs_prec: %.3f"
        print(frmt % (epoch_i, epoch_loss, test_ll, train_ll, test_rmse, train_rmse, precision))


Beginning training in EXACT=False mode with 32 particles
[epoch 000] loss: 3.2639  test_ll: -1.449  train_ll: -1.511  test_rmse: 0.939  train_rmse: 0.972  obs_prec: 2.026
[epoch 005] loss: 0.5269  test_ll: -0.458  train_ll: -0.449  test_rmse: 0.381  train_rmse: 0.377  obs_prec: 6.274
[epoch 010] loss: 0.4723  test_ll: -0.479  train_ll: -0.476  test_rmse: 0.392  train_rmse: 0.391  obs_prec: 6.852
[epoch 015] loss: 0.4568  test_ll: -0.422  train_ll: -0.403  test_rmse: 0.369  train_rmse: 0.363  obs_prec: 7.432
[epoch 020] loss: 0.4459  test_ll: -0.389  train_ll: -0.371  test_rmse: 0.356  train_rmse: 0.349  obs_prec: 7.239
[epoch 025] loss: 0.4367  test_ll: -0.395  train_ll: -0.370  test_rmse: 0.360  train_rmse: 0.351  obs_prec: 8.035
[epoch 030] loss: 0.4296  test_ll: -0.395  train_ll: -0.382  test_rmse: 0.359  train_rmse: 0.354  obs_prec: 7.432
[epoch 035] loss: 0.4244  test_ll: -0.407  train_ll: -0.387  test_rmse: 0.363  train_rmse: 0.355  obs_prec: 6.974
[epoch 040] loss: 0.4272  test_

  warn_if_nan(loss, "loss")


RuntimeError: 
                                                                               Trace Shapes:        
                                                                                Param Sites:        
                                                                                    log_beta       1
                                                               output_layer.gp_layer$$$means      50
                                                            output_layer.gp_layer$$$raw_vars      50
                                                            output_layer.gp_layer$$$log_beta       1
                                output_layer.gp_layer$$$variational_strategy.inducing_points 1 50  2
      output_layer.gp_layer$$$variational_strategy.variational_distribution.variational_mean    1 50
output_layer.gp_layer$$$variational_strategy.variational_distribution.chol_variational_covar 1 50 50
                                    output_layer.gp_layer$$$likelihood.noise_covar.raw_noise       1
                                                output_layer.gp_layer$$$mean_module.constant    1  1
                                        output_layer.gp_layer$$$covar_module.raw_outputscale       1
                            output_layer.gp_layer$$$covar_module.base_kernel.raw_lengthscale 1  1  2
                                                                     layer1.gp_layer$$$means      50
                                                                  layer1.gp_layer$$$raw_vars      50
                                      layer1.gp_layer$$$variational_strategy.inducing_points 2 50 18
            layer1.gp_layer$$$variational_strategy.variational_distribution.variational_mean    2 50
      layer1.gp_layer$$$variational_strategy.variational_distribution.chol_variational_covar 2 50 50
                                                      layer1.gp_layer$$$mean_module.constant    2  1
                                              layer1.gp_layer$$$covar_module.raw_outputscale       2
                                  layer1.gp_layer$$$covar_module.base_kernel.raw_lengthscale 2  1 18
                                                                               Sample Sites:        
                                                               num_particles_vectorized dist       |
                                                                                       value   32  |

In [None]:
# Leftover post-mortem debugging cell: opens the interactive debugger on the most recent exception.
# NOTE(review): remove before sharing so that Restart & Run All does not stop at a debugger prompt.
%debug

In [None]:
# Scratch cell: a bare expression that only displays the torch.float64 dtype object.
torch.float64