In [1]:
%set_env CUDA_VISIBLE_DEVICES=1

from gpytorch.models.pyro_deep_gp import AbstractPyroHiddenGPLayer, AbstractPyroDeepGP
from gpytorch.variational import VariationalStrategy, CholeskyVariationalDistribution
from gpytorch.kernels import MaternKernel, ScaleKernel, RBFKernel
from gpytorch.means import ConstantMean
from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import GaussianLikelihood

import torch

env: CUDA_VISIBLE_DEVICES=1


In [2]:
NUM_INDUCING = 128


class ToyHiddenGPLayer(AbstractPyroHiddenGPLayer):
    def __init__(self, input_dims, output_dims, name=""):
        inducing_points = torch.randn(output_dims, NUM_INDUCING, input_dims)
        
        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=inducing_points.size(-2),
            batch_size=output_dims
        )
        
        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True
        )
        
        super().__init__(variational_strategy, input_dims, output_dims, name)
        
        batch_shape = torch.Size([output_dims])
        
        self.mean_module = ConstantMean(batch_shape=batch_shape)
        self.covar_module = ScaleKernel(
            MaternKernel(nu=2.5, batch_shape=batch_shape, ard_num_dims=input_dims), 
            batch_shape=batch_shape
        )
        
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return MultivariateNormal(mean_x, covar_x)        

In [3]:
#TODO: Double inheritance
class ToyDeepGP(AbstractPyroDeepGP):
    def __init__(self, input_dims, output_dims, total_num_data, hidden_gp_layers, likelihood, name=""):
        inducing_points = torch.randn(output_dims, NUM_INDUCING, input_dims)
        
        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=inducing_points.size(-2),
            batch_size=output_dims
        )
        
        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True
        )
        
        super().__init__(
            variational_strategy,
            input_dims,
            output_dims,
            total_num_data,
            hidden_gp_layers,
            likelihood,
            name
        )
        
        batch_shape = torch.Size([output_dims])
        
        self.mean_module = ConstantMean(batch_shape=batch_shape)
        self.covar_module = ScaleKernel(
            MaternKernel(nu=2.5, batch_shape=batch_shape, ard_num_dims=input_dims), 
            batch_shape=batch_shape
        )
    

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return MultivariateNormal(mean_x, covar_x)

In [4]:
import urllib.request
import os.path
from scipy.io import loadmat
from math import floor
import numpy as np

dataset_name = 'elevators'

data = torch.Tensor(loadmat(f'/home/jake.gardner/data/{dataset_name}.mat')['data'])
X = data[:, :-1]
y = data[:, -1]

N = data.shape[0]
np.random.seed(0)
data = data[np.random.permutation(np.arange(N)),:]

train_n = int(floor(0.8*len(X)))

train_x = X[:train_n, :].contiguous().cuda()
train_y = y[:train_n].contiguous().cuda()

test_x = X[train_n:, :].contiguous().cuda()
test_y = y[train_n:].contiguous().cuda()

mean = train_x.mean(dim=-2, keepdim=True)
std = train_x.std(dim=-2, keepdim=True) + 1e-6
train_x = (train_x - mean) / std
test_x = (test_x - mean) / std

mean,std = train_y.mean(),train_y.std()
train_y = (train_y - mean) / std
test_y = (test_y - mean) / std

In [5]:
print(f'Training data size: {train_x.size(-2)}, test data set: {test_x.size(-2)}, some float: {std:.3f}')

Training data size: 13279, test data set: 3320, some float: 0.253


In [6]:
likelihood = GaussianLikelihood()

hidden_gp = ToyHiddenGPLayer(train_x.size(-1), 10, name="layer1").cuda()
deep_gp = ToyDeepGP(10, 1, train_x.size(-2), [hidden_gp], likelihood, name="output_layer").cuda()

hidden_gp.eval()
deep_gp.eval()

ToyDeepGP(
  (variational_strategy): VariationalStrategy(
    (variational_distribution): CholeskyVariationalDistribution()
  )
  (likelihood): GaussianLikelihood(
    (quadrature): GaussHermiteQuadrature1D()
    (noise_covar): HomoskedasticNoise(
      (raw_noise_constraint): GreaterThan(1.000E-04)
    )
  )
  (mean_module): ConstantMean()
  (covar_module): ScaleKernel(
    (base_kernel): MaternKernel(
      (raw_lengthscale_constraint): Positive()
    )
    (raw_outputscale_constraint): Positive()
  )
)

In [7]:
state_dict = torch.load('trained_gp.pth')
deep_gp.load_state_dict(state_dict[0])
hidden_gp.load_state_dict(state_dict[1])

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [8]:
from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

In [11]:
deep_gp.EXACT = False
hidden_gp.EXACT = False

In [12]:
from pyro.infer import SVI, TraceMeanField_ELBO, Trace_ELBO

from pyro import optim

optimizer = optim.Adam({"lr": 0.0, "betas": (0.96, 0.999)})


elbo = Trace_ELBO(num_particles=64, vectorize_particles=True, max_plate_nesting=1)
svi = SVI(deep_gp.model, deep_gp.guide, optimizer, elbo)

for epoch_i in range(30):
    epoch_loss = 0
    for minibatch_i, (x_batch, y_batch) in enumerate(train_loader):
        loss = svi.step(x_batch, y_batch)
        epoch_loss = epoch_loss + loss / len(train_loader)
        if minibatch_i % 1 == 0:
            print(f'Epoch {epoch_i}, [{minibatch_i} / {len(train_loader)}], loss = {loss:.3f}, last epoch loss = {epoch_loss:.3f}')
#         break

Epoch 0, [0 / 13], loss = 0.056, last epoch loss = 0.004
Epoch 0, [1 / 13], loss = 0.055, last epoch loss = 0.009
Epoch 0, [2 / 13], loss = 0.055, last epoch loss = 0.013
Epoch 0, [3 / 13], loss = 0.055, last epoch loss = 0.017
Epoch 0, [4 / 13], loss = 0.056, last epoch loss = 0.021
Epoch 0, [5 / 13], loss = 0.055, last epoch loss = 0.026
Epoch 0, [6 / 13], loss = 0.055, last epoch loss = 0.030
Epoch 0, [7 / 13], loss = 0.056, last epoch loss = 0.034
Epoch 0, [8 / 13], loss = 0.055, last epoch loss = 0.038
Epoch 0, [9 / 13], loss = 0.055, last epoch loss = 0.043
Epoch 0, [10 / 13], loss = 0.055, last epoch loss = 0.047
Epoch 0, [11 / 13], loss = 0.056, last epoch loss = 0.051
Epoch 0, [12 / 13], loss = 0.056, last epoch loss = 0.055
Epoch 1, [0 / 13], loss = 0.057, last epoch loss = 0.004
Epoch 1, [1 / 13], loss = 0.056, last epoch loss = 0.009
Epoch 1, [2 / 13], loss = 0.055, last epoch loss = 0.013
Epoch 1, [3 / 13], loss = 0.056, last epoch loss = 0.017
Epoch 1, [4 / 13], loss = 0.

KeyboardInterrupt: 

In [14]:
x=41602.5156+5616.2090
y=737.793
print(x/y)

63.99996286221203


In [14]:
x_batch, y_batch = next(iter(train_loader))

In [13]:
from pyro.poutine import trace, replay

guide_trace = trace(deep_gp.guide).get_trace(x_batch, y_batch)

model_trace = trace(replay(deep_gp.model, guide_trace)).get_trace(x_batch, y_batch)

for site in model_trace:
    print("model site", site)
for site in guide_trace:
    print("guide site", site)

model site _INPUT
model site output_layer.gp_layer$$$variational_strategy.inducing_points
model site output_layer.gp_layer$$$variational_strategy.variational_distribution.variational_mean
model site output_layer.gp_layer$$$variational_strategy.variational_distribution.chol_variational_covar
model site output_layer.gp_layer$$$likelihood.noise_covar.raw_noise
model site output_layer.gp_layer$$$mean_module.constant
model site output_layer.gp_layer$$$covar_module.raw_outputscale
model site output_layer.gp_layer$$$covar_module.base_kernel.raw_lengthscale
model site layer1.gp_layer$$$variational_strategy.inducing_points
model site layer1.gp_layer$$$variational_strategy.variational_distribution.variational_mean
model site layer1.gp_layer$$$variational_strategy.variational_distribution.chol_variational_covar
model site layer1.gp_layer$$$mean_module.constant
model site layer1.gp_layer$$$covar_module.raw_outputscale
model site layer1.gp_layer$$$covar_module.base_kernel.raw_lengthscale
model site

In [14]:
deep_gp.EXACT = True
hidden_gp.EXACT = True

guide_trace = trace(deep_gp.guide).get_trace(x_batch, y_batch)

good_model_trace = trace(replay(deep_gp.model, guide_trace)).get_trace(x_batch, y_batch)

for site in good_model_trace:
    print("model site", site)
for site in guide_trace:
    print("guide site", site)

model site _INPUT
model site output_layer.gp_layer$$$variational_strategy.inducing_points
model site output_layer.gp_layer$$$variational_strategy.variational_distribution.variational_mean
model site output_layer.gp_layer$$$variational_strategy.variational_distribution.chol_variational_covar
model site output_layer.gp_layer$$$likelihood.noise_covar.raw_noise
model site output_layer.gp_layer$$$mean_module.constant
model site output_layer.gp_layer$$$covar_module.raw_outputscale
model site output_layer.gp_layer$$$covar_module.base_kernel.raw_lengthscale
model site layer1.gp_layer$$$variational_strategy.inducing_points
model site layer1.gp_layer$$$variational_strategy.variational_distribution.variational_mean
model site layer1.gp_layer$$$variational_strategy.variational_distribution.chol_variational_covar
model site layer1.gp_layer$$$mean_module.constant
model site layer1.gp_layer$$$covar_module.raw_outputscale
model site layer1.gp_layer$$$covar_module.base_kernel.raw_lengthscale
model site

In [15]:
#print(model_trace.nodes['layer1.inducing_values']['value'].shape)
good_model_trace.compute_log_prob()
print(512*good_model_trace.nodes['output_layer.output_value']['log_prob'].mean())
#print(guide_trace.nodes['layer1.inducing_values']['value'].shape)
#print(guide_trace.nodes['output_layer.inducing_values']['value'].shape)

tensor(-2.7838e-12, device='cuda:0', grad_fn=<MulBackward0>)


In [16]:
#print(model_trace.nodes['layer1.inducing_values']['value'].shape)
model_trace.compute_log_prob()
print((512*model_trace.nodes['output_layer.output_value']['log_prob']).mean())
#print(guide_trace.nodes['layer1.inducing_values']['value'].shape)
#print(guide_trace.nodes['output_layer.inducing_values']['value'].shape)

tensor(-3.8091e-12, device='cuda:0', grad_fn=<MeanBackward0>)


In [29]:
hidden_gp.variational_distribution.covariance_matrix[0]

tensor([[ 0.0734,  0.0050, -0.0023,  ...,  0.0046,  0.0047,  0.0040],
        [ 0.0050,  0.0494, -0.0011,  ...,  0.0014, -0.0038,  0.0036],
        [-0.0023, -0.0011,  0.0538,  ..., -0.0032, -0.0021, -0.0057],
        ...,
        [ 0.0046,  0.0014, -0.0032,  ...,  0.0954, -0.0039, -0.0008],
        [ 0.0047, -0.0038, -0.0021,  ..., -0.0039,  0.0929,  0.0027],
        [ 0.0040,  0.0036, -0.0057,  ..., -0.0008,  0.0027,  0.0656]],
       device='cuda:0', grad_fn=<SelectBackward>)

In [14]:
torch.cat(samples).squeeze(-1)

tensor([[-0.7938, -0.1091, -0.8158,  ...,  0.3406, -2.0006,  1.1564],
        [ 0.2544, -0.2884, -0.8945,  ...,  0.9977, -0.5145,  1.2859],
        [ 0.5308,  0.1070,  0.8624,  ...,  1.1606,  0.9395,  0.2596],
        ...,
        [-0.1504,  0.0851, -1.1425,  ...,  0.3579,  0.2761,  1.4231],
        [ 0.0246, -1.6314, -0.5040,  ..., -1.0291, -1.5203,  0.1858],
        [-1.1408, -0.9031, -0.9281,  ..., -0.3486, -0.7034,  0.9233]],
       device='cuda:0')

In [15]:
targetmean = torch.cat(samples).mean(0)
targetstd = torch.cat(samples).std(0)

In [18]:
deep_gp.variational_strategy(deep_gp.debug_inputs).variance.sqrt().squeeze()[:10].data.cpu().numpy().tolist()

[0.5476316213607788,
 0.5202265381813049,
 0.6973344683647156,
 0.5178024172782898,
 0.6270923614501953,
 0.6165500283241272,
 0.6566213965415955,
 0.6366702318191528,
 0.6930236220359802,
 0.5574667453765869]

In [19]:
targetstd.squeeze()[:10].data.cpu().numpy().tolist()

[0.5367881655693054,
 0.5097756385803223,
 0.6767644286155701,
 0.5547028183937073,
 0.6438401341438293,
 0.600436270236969,
 0.6482633948326111,
 0.6319460272789001,
 0.7010195851325989,
 0.5514597296714783]

In [30]:
deep_gp.mean_module.constant

Parameter containing:
tensor([[-0.4946]], device='cuda:0', requires_grad=True)

In [40]:
samples2 = []
for i in range(500):
    with torch.no_grad():
        sample = ToyHiddenGPLayer.model(deep_gp, inputs[0], return_samples=True)
        samples2.append(sample)

In [19]:
deep_gp.variational_strategy(inputs[0])

-0.0006227144040167332

In [20]:
hidden_gp.variational_strategy.variational_distribution.variational_mean

Parameter containing:
tensor([[-0.3868, -0.5244,  0.2853,  ..., -1.2279, -0.3916, -0.5635],
        [-0.0413, -0.1444, -0.0837,  ..., -0.0363, -0.0189,  0.0718],
        [-0.0035, -0.0599,  0.0911,  ...,  0.1380,  0.3721,  0.4495],
        ...,
        [ 0.0637, -0.3526, -0.0740,  ..., -0.0193, -0.0448,  0.0386],
        [-0.2438, -0.2021,  0.0399,  ..., -0.0111, -0.1915, -0.1610],
        [-1.3683, -1.5571,  0.9494,  ...,  0.8288, -0.7905, -0.8687]],
       device='cuda:0', requires_grad=True)

In [21]:
torch.save([deep_gp.state_dict(), hidden_gp.state_dict()], 'trained_gp.pth')

In [22]:
state_dict

[OrderedDict([('variational_strategy.inducing_points',
               tensor([[[ 0.2636, -0.3732,  0.3966,  ..., -0.0683, -0.3187,  0.2445],
                        [-0.9679,  0.1655,  0.2087,  ...,  0.0339, -0.2558,  0.1049],
                        [-0.4833,  0.6904,  0.4577,  ...,  0.2560,  0.5616,  0.4499],
                        ...,
                        [-0.6785,  0.1705, -0.3937,  ...,  0.7312,  0.9520,  0.4070],
                        [ 1.2968,  1.3922, -0.0182,  ...,  0.4934, -1.0872,  2.0364],
                        [-0.4223, -0.1983, -0.2489,  ...,  0.0747,  0.8132, -1.8235]]],
                      device='cuda:0')),
              ('variational_strategy.variational_params_initialized',
               tensor(1, device='cuda:0')),
              ('variational_strategy.variational_distribution.variational_mean',
               tensor([[ 1.3314,  0.7089,  1.9481, -0.3457,  0.6118,  1.9193,  1.8307, -0.3567,
                        -0.2294,  1.1335,  0.2448, -0.6334,  1.386

In [36]:
import gc

In [38]:
gc.collect()

0