# Exact GP Regression with Multiple GPUs and Kernel Partitioning

In [1]:
import math
import torch
import gpytorch
from matplotlib import pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Protein dataset
We will be using the UCI Protein dataset, which contains more than 40,000 data points.

In [2]:
import os
import urllib.request
from scipy.io import loadmat
# Name of the UCI dataset; the file is cached locally as '<dataset>.mat'
dataset = 'protein'
mat_path = f'{dataset}.mat'

# Download only when there is no local copy of the file yet
if not os.path.isfile(mat_path):
    print(f'Downloading \'{dataset}\' UCI dataset...')
    urllib.request.urlretrieve(
        'https://drive.google.com/uc?export=download&id=1nRb8e7qooozXkNghC5eQS0JeywSXGX2S',
        mat_path,
    )

# The 'data' entry of the .mat file is turned into a torch tensor
data = torch.Tensor(loadmat(mat_path)['data'])

In [3]:
import numpy as np

N = data.shape[0]

# 80/20 train/test split, taken in row order (no shuffling, no validation set)
n_train = int(0.8 * N)
train_rows = data[:n_train]
test_rows = data[n_train:]

# the last column is used as the label, all preceding columns as features
train_x, train_y = train_rows[:, :-1], train_rows[:, -1]
test_x, test_y = test_rows[:, :-1], test_rows[:, -1]

# normalize features with statistics computed on the training split only
mean = train_x.mean(dim=-2, keepdim=True)
std = train_x.std(dim=-2, keepdim=True) + 1e-6  # prevent dividing by 0
train_x = (train_x - mean) / std
test_x = (test_x - mean) / std

# normalize labels, again with training-split statistics
# (`mean` and `std` are rebound here and now refer to the labels)
mean = train_y.mean()
std = train_y.std()
train_y = (train_y - mean) / std
test_y = (test_y - mean) / std

# make contiguous
train_x = train_x.contiguous()
train_y = train_y.contiguous()
test_x = test_x.contiguous()
test_y = test_y.contiguous()

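The following function searches for approximately the largest kernel partition size with which an exact GP can be trained on the dataset using the specified number of GPUs. It tries successively smaller partition sizes and stops at the first one for which a single training iteration completes without an error.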

In [4]:
import gc

def find_best_gpu_setting(train_x,
                          train_y,
                          n_devices,
                          output_device,
                          preconditioner_size
):
    """Search for the largest kernel partition size that trains without error.

    Candidate sizes are tried from largest to smallest: first 0 (no
    partitioning), then ceil(N / 2**k) for increasing k. Each candidate is
    checked by running a single training iteration through ``train``.

    Args:
        train_x: Training inputs; ``train_x.size(0)`` is used as N.
        train_y: Training targets, forwarded to ``train``.
        n_devices: Number of devices, forwarded to ``train``.
        output_device: Output device, forwarded to ``train``.
        preconditioner_size: Preconditioner size, forwarded to ``train``.

    Returns:
        The first candidate size for which the trial iteration completed.

    Raises:
        RuntimeError: If every candidate failed. Returning the last failed
            size instead would only postpone the failure to the real
            training run.
    """
    N = train_x.size(0)

    # Find the optimum partition/checkpoint size by decreasing in powers of 2
    # Start with no partitioning (size = 0)
    settings = [0] + [int(n) for n in np.ceil(N / 2**np.arange(1, np.floor(np.log2(N))))]

    # Only the message of the last failure is kept, never the exception object:
    # a stored exception keeps its traceback (and the frames it refers to)
    # alive, which would work against the memory cleanup below.
    last_failure = None

    for checkpoint_size in settings:
        print(f'Number of devices: {n_devices} -- Kernel partition size: {checkpoint_size}')
        try:
            # Try a full forward and backward pass with this setting to check memory usage.
            # The returned model/likelihood are deliberately not bound to a name so
            # that they can be released before the cleanup in the finally block.
            train(train_x, train_y,
                  n_devices=n_devices, output_device=output_device,
                  checkpoint_size=checkpoint_size,
                  preconditioner_size=preconditioner_size, n_training_iter=1)
        except RuntimeError as e:
            print(f'RuntimeError: {e}')
            last_failure = f'RuntimeError: {e}'
        except AttributeError as e:
            print(f'AttributeError: {e}')
            last_failure = f'AttributeError: {e}'
        else:
            # first setting that works; the finally block still runs before returning
            return checkpoint_size
        finally:
            # handle CUDA OOM error
            gc.collect()
            torch.cuda.empty_cache()

    raise RuntimeError(
        f'No kernel partition size allowed a training iteration to complete '
        f'on {n_devices} device(s). Last error: {last_failure}'
    )

In [5]:
# We will use the simplest form of GP model with an exact kernel
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a scaled RBF kernel wrapped in a MultiDeviceKernel.

    Args:
        train_x: Training inputs, passed on to ``gpytorch.models.ExactGP``.
        train_y: Training targets, passed on to ``gpytorch.models.ExactGP``.
        likelihood: Likelihood, passed on to ``gpytorch.models.ExactGP``.
        n_devices: Optional number of CUDA devices; the kernel is given the
            device ids ``range(n_devices)``. Defaults to
            ``torch.cuda.device_count()``.
        output_device: Optional device passed to ``MultiDeviceKernel`` as its
            ``output_device``. Defaults to the device holding ``train_x``.
    """

    def __init__(self, train_x, train_y, likelihood, n_devices=None, output_device=None):
        super().__init__(train_x, train_y, likelihood)

        # Resolve the device configuration from the arguments rather than from a
        # notebook-level `output_device` global, so the class does not depend on
        # which cells happen to have been executed before it is instantiated.
        if n_devices is None:
            n_devices = torch.cuda.device_count()
        if output_device is None:
            output_device = train_x.device

        self.mean_module = gpytorch.means.ConstantMean()
        base_covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

        self.covar_module = gpytorch.kernels.MultiDeviceKernel(
            base_covar_module, device_ids=range(n_devices),
            output_device=output_device
        )

    def forward(self, x):
        """Return the multivariate normal given by the mean and covariance modules at ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

In [6]:
def train(train_x,
          train_y,
          n_devices,
          output_device,
          checkpoint_size,
          preconditioner_size,
          n_training_iter,
):
    """Fit an ``ExactGPModel`` with Adam (lr=0.1) and return it with its likelihood.

    Args:
        train_x: Training inputs.
        train_y: Training targets.
        n_devices: Number of GPUs; within this function it is only used in
            the final status message.
        output_device: Device the likelihood and the model are moved to.
        checkpoint_size: Value passed to ``gpytorch.beta_features.checkpoint_kernel``.
        preconditioner_size: Value passed to
            ``gpytorch.settings.max_preconditioner_size``.
        n_training_iter: Number of optimizer steps to run.

    Returns:
        Tuple ``(model, likelihood)``.
    """
    likelihood = gpytorch.likelihoods.GaussianLikelihood().to(output_device)
    model = ExactGPModel(train_x, train_y, likelihood).to(output_device)

    # Put both modules into training mode
    model.train()
    likelihood.train()

    adam = torch.optim.Adam([{'params': model.parameters()}], lr=0.1)

    # Objective for GPs: the marginal log likelihood (negated below to obtain a loss)
    marginal_ll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    progress_fmt = 'Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f'

    for step in range(1, n_training_iter + 1):
        adam.zero_grad()

        # The forward and the backward pass both run inside the two settings contexts
        with gpytorch.beta_features.checkpoint_kernel(checkpoint_size):
            with gpytorch.settings.max_preconditioner_size(preconditioner_size):
                output = model(train_x)
                loss = -marginal_ll(output, train_y)
                loss.backward()

        # Values are read before the optimizer step, i.e. they belong to this loss
        loss_value = loss.item()
        lengthscale = model.covar_module.module.base_kernel.lengthscale.item()
        noise = model.likelihood.noise.item()
        print(progress_fmt % (step, n_training_iter, loss_value, lengthscale, noise))

        adam.step()

    print(f"Finished training on {train_x.size(0)} data points using {n_devices} GPUs.")
    return model, likelihood

In [7]:
# Device the data tensors are moved to; it is also passed to the training
# helpers as their output device
output_device = torch.device('cuda:0')

train_x = train_x.to(output_device)
train_y = train_y.to(output_device)
test_x = test_x.to(output_device)
test_y = test_y.to(output_device)

In [8]:
# Use every CUDA device that PyTorch reports
n_devices = torch.cuda.device_count()

# Set a large enough preconditioner size to reduce the number of CG iterations run
preconditioner_size = 100

# Probe for a kernel partition size with which one training iteration succeeds
checkpoint_size = find_best_gpu_setting(
    train_x,
    train_y,
    n_devices=n_devices,
    output_device=output_device,
    preconditioner_size=preconditioner_size,
)

Number of devices: 1 -- Kernel partition size: 0
RuntimeError: CUDA out of memory. Tried to allocate 4.99 GiB (GPU 0; 10.91 GiB total capacity; 4.99 GiB already allocated; 4.76 GiB free; 233.00 KiB cached)
Number of devices: 1 -- Kernel partition size: 18292
RuntimeError: CUDA out of memory. Tried to allocate 2.49 GiB (GPU 0; 10.91 GiB total capacity; 7.49 GiB already allocated; 2.22 GiB free; 410.50 KiB cached)
Number of devices: 1 -- Kernel partition size: 9146
Iter 1/1 - Loss: 1.070   lengthscale: 0.693   noise: 0.693
Finished training on 36584 data points using 1 GPUs.


# Now we train

In [9]:
# Full training run (50 iterations) with the partition size returned by the probe
model, likelihood = train(
    train_x,
    train_y,
    n_devices=n_devices,
    output_device=output_device,
    checkpoint_size=checkpoint_size,
    preconditioner_size=preconditioner_size,
    n_training_iter=50,
)

Iter 1/50 - Loss: 1.069   lengthscale: 0.693   noise: 0.693
Iter 2/50 - Loss: 1.052   lengthscale: 0.644   noise: 0.644
Iter 3/50 - Loss: 1.033   lengthscale: 0.599   noise: 0.598
Iter 4/50 - Loss: 1.017   lengthscale: 0.555   noise: 0.554
Iter 5/50 - Loss: 1.000   lengthscale: 0.516   noise: 0.513
Iter 6/50 - Loss: 0.986   lengthscale: 0.482   noise: 0.474
Iter 7/50 - Loss: 0.973   lengthscale: 0.453   noise: 0.438
Iter 8/50 - Loss: 0.957   lengthscale: 0.431   noise: 0.404
Iter 9/50 - Loss: 0.946   lengthscale: 0.415   noise: 0.373
Iter 10/50 - Loss: 0.934   lengthscale: 0.404   noise: 0.344
Iter 11/50 - Loss: 0.924   lengthscale: 0.394   noise: 0.317
Iter 12/50 - Loss: 0.916   lengthscale: 0.384   noise: 0.292
Iter 13/50 - Loss: 0.911   lengthscale: 0.371   noise: 0.270
Iter 14/50 - Loss: 0.906   lengthscale: 0.354   noise: 0.250
Iter 15/50 - Loss: 0.903   lengthscale: 0.337   noise: 0.233
Iter 16/50 - Loss: 0.897   lengthscale: 0.319   noise: 0.217
Iter 17/50 - Loss: 0.895   length

# Finally we test the root-mean-square error

In [14]:
# Switch model and likelihood to evaluation (predictive posterior) mode
model.eval()
likelihood.eval()

# Predict without gradient tracking, using the same kernel partition and
# preconditioner settings that were used for training
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
         gpytorch.settings.max_preconditioner_size(preconditioner_size):
        latent_pred = model(test_x)
        observed_pred = likelihood(latent_pred)

        # Root-mean-square error between the predictive mean and the test labels
        test_rmse = (observed_pred.mean - test_y).pow(2).mean().sqrt()

In [15]:
# The RMSE is in units of the standardized labels: test_y was scaled with the
# training-label mean and standard deviation in the normalization cell
print(f"Test RMSE: {test_rmse.item()}")

Test RMSE: 0.5553563237190247
