In [1]:
# Import our GPyTorch library
import gpytorch

# Import some classes we will use from torch
from torch.autograd import Variable
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader

# While we have theoretically fast algorithms for Toeplitz matrix-vector multiplication, the hardware of GPUs
# is so well-designed that naive multiplication on them beats the current implementation of our algorithm (despite
# its theoretically faster computation). Because of this, we set the use_toeplitz flag to False to minimize runtime
gpytorch.functions.use_toeplitz = False

In [2]:
# Import datasets to access MNIST and transforms to format data for learning
from torchvision import transforms, datasets

# Download (if not already present under /tmp) and load the MNIST train and test splits.
# Both splits use the same preprocessing pipeline, so it is composed once and shared:
#   1. ToTensor converts each image to a torch.FloatTensor of shape (colors x height x width) in the
#      range [0.0, 1.0], as opposed to an image with shape (height x width x colors)
#   2. Normalize standardizes using the mean (0.1307) and standard deviation (0.3081), which were
#      calculated ahead of time (not here)
# Transformation documentation here: http://pytorch.org/docs/master/torchvision/transforms.html
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
train_dataset = datasets.MNIST('/tmp', train=True, download=True, transform=mnist_transform)
test_dataset = datasets.MNIST('/tmp', train=False, download=True, transform=mnist_transform)

# Put the data into DataLoaders. Only the training data is shuffled: the order in which the training
# data is presented affects the outcome, unlike the order of the test data.
train_loader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=256,
    shuffle=False,
    pin_memory=True,
)

## Define the feature extractor for our deep kernel

In [3]:
# Import torch's neural network
# Documentation here: http://pytorch.org/docs/master/nn.html
from torch import nn
# Import torch.nn.functional for various activation/pooling functions
# Documentation here: http://pytorch.org/docs/master/nn.html#torch-nn-functional
from torch.nn import functional as F

# We make a classic LeNet Architecture sans a final prediction layer to 10 outputs. This will serve as a feature
# extractor reducing the dimensionality of our data down to 64. We will pretrain these layers by adding on a 
# final classifying 64-->10 layer
# https://medium.com/@siddharthdas_32104/cnns-architectures-lenet-alexnet-vgg-googlenet-resnet-and-more-666091488df5
class LeNetFeatureExtractor(nn.Module):
    """LeNet-style convolutional network without a final prediction layer.

    Two (convolution -> batch norm -> ReLU -> 2x2 max-pool) stages are followed by a fully
    connected layer with batch norm and ReLU, producing 64 features per input.

    The 5x5 convolutions use padding 2, so they preserve the spatial size; the two poolings
    each halve it. The flattening step expects 32 x 7 x 7 values per example, which is what a
    1 x 28 x 28 input yields.
    """

    def __init__(self):
        super(LeNetFeatureExtractor, self).__init__()
        # Stage 1: 1 input channel -> 16 feature maps
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2)
        self.norm1 = nn.BatchNorm2d(16)
        # Stage 2: 16 -> 32 feature maps
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, padding=2)
        self.norm2 = nn.BatchNorm2d(32)
        # Fully connected projection of the flattened maps down to 64 features
        self.fc3 = nn.Linear(32 * 7 * 7, 64)
        self.norm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        """Return the (batch, 64) non-negative feature matrix for the input batch ``x``."""
        # Stage 1
        hidden = self.conv1(x)
        hidden = self.norm1(hidden)
        hidden = F.relu(hidden)
        hidden = F.max_pool2d(hidden, 2)
        # Stage 2
        hidden = self.conv2(hidden)
        hidden = self.norm2(hidden)
        hidden = F.relu(hidden)
        hidden = F.max_pool2d(hidden, 2)
        # Flatten each example's feature maps into one vector before the linear layer
        flat = hidden.view(-1, 32 * 7 * 7)
        return F.relu(self.norm3(self.fc3(flat)))
    
# Instantiate the feature extractor and move it to the GPU (this requires a CUDA device)
feature_extractor = LeNetFeatureExtractor().cuda()

### Pretrain the feature extractor a bit

In [4]:
# Classification head used for pretraining: maps the feature extractor's 64 outputs to 10 class scores
classifier = nn.Linear(64, 10)
classifier = classifier.cuda()
# Gather everything to optimize: the feature extractor's parameters first, then the classifier's
params = list(feature_extractor.parameters())
params.extend(classifier.parameters())
# We train the network using stochastic gradient descent (with momentum)
optimizer = SGD(params, momentum=0.9, lr=0.1)

# Define our pretraining function
#    Set the feature extractor to train mode (needed b/c its BatchNorm layers behave differently in train and
#          eval mode; the classifier is just a single Linear layer, which the mode does not affect)
#    Iterate through train_loader
#    Put the data on the GPU as a Variable
#    Zero out the gradients left from the previous step (needed b/c PyTorch accumulates gradients across
#          backward() calls by default, which is handy for e.g. RNNs but not what we want here)
#    Extract the 64-dimensional feature vector
#    Feed the features into the classifying layer and output the log softmax
#    Calculate the negative log likelihood loss
#    NOTE: torch.nn.functional.cross_entropy combines log_softmax and nll_loss, so it could replace the two steps above
#    Backprop
#    Take one optimizer step to update the parameters
#    Accumulate the training loss
#    Print the result of the epoch
def pretrain(epoch):
    """Run one pretraining epoch of the feature extractor plus classifier head.

    Makes a single pass over the module-level ``train_loader``, stepping the module-level
    ``optimizer`` on the negative log likelihood of the log-softmax outputs, then prints the
    average per-example loss for the epoch.

    Args:
        epoch: Epoch number; only used in the printed progress line.
    """
    feature_extractor.train()
    running_loss = 0.
    for images, labels in train_loader:
        # Move the batch to the GPU and wrap it for autograd
        images = Variable(images.cuda())
        labels = Variable(labels.cuda())
        optimizer.zero_grad()
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()
        # The batch loss is weighted by the batch size so that the final division by the
        # dataset size gives a per-example average
        running_loss += batch_loss.data[0] * len(images)
    mean_loss = running_loss / len(train_dataset)
    print('Train Epoch: %d\tLoss: %.6f' % (epoch, mean_loss))

# Set feature extractor to eval mode. This only affects layers such as Dropout and BatchNorm; our feature
# extractor does use BatchNorm, so eval mode makes it normalize with its running statistics instead of batch statistics
# http://pytorch.org/docs/master/nn.html#torch.nn.Module.train
# Set test_loss accumulator and correct counter
# Iterate through test data
#    volatile=True marks the input as inference-only, so autograd does not keep the information needed for
#          backprop (gradients are not needed in test mode), which saves memory
#    calculate loss and accumulate
#    make prediction and check accuracy
def pretest():
    """Evaluate the feature extractor plus classifier head on the test set.

    Makes a single pass over the module-level ``test_loader`` and prints the average negative
    log likelihood per test example together with the classification accuracy.
    """
    feature_extractor.eval()
    n_test = len(test_loader.dataset)
    total_loss = 0
    n_correct = 0
    for images, labels in test_loader:
        # volatile=True: no gradients are needed during evaluation
        images = Variable(images.cuda(), volatile=True)
        labels = Variable(labels.cuda())
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        # size_average=False sums the loss over the batch; the average is taken after the loop
        total_loss += F.nll_loss(log_probs, labels, size_average=False).data[0]
        # The predicted class is the index of the largest log-probability
        _, predicted = log_probs.data.max(1, keepdim=True)
        n_correct += predicted.eq(labels.data.view_as(predicted)).cpu().sum()
    total_loss /= n_test
    accuracy_pct = 100. * n_correct / n_test
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)'.format(
        total_loss, n_correct, n_test, accuracy_pct))

# Pretrain for a few epochs, evaluating on the test set after each one (epochs are numbered from 1)
n_epochs = 3
for epoch in range(1, n_epochs + 1):
    pretrain(epoch)
    pretest()

Train Epoch: 1	Loss: 0.151788
Test set: Average loss: 0.0388, Accuracy: 9882/10000 (98.820%)
Train Epoch: 2	Loss: 0.036721
Test set: Average loss: 0.0341, Accuracy: 9885/10000 (98.850%)
Train Epoch: 3	Loss: 0.025171
Test set: Average loss: 0.0375, Accuracy: 9876/10000 (98.760%)


## Define the deep kernel GP

In [5]:
# now this is our first exposure to the usefulness of gpytorch

# gpytorch.Module is a subclass of torch.nn.Module
class DKLModel(gpytorch.Module):
    """Deep kernel learning model: a feature-extracting network feeding GP latent functions.

    Args:
        feature_extractor: Network that maps raw inputs to feature vectors.
        n_features: Dimension of the extracted feature vector (64 by default).
        grid_bounds: (lower, upper) range the features are scaled into before they reach
            the latent functions.
    """

    def __init__(self, feature_extractor, n_features=64, grid_bounds=(-10., 10.)):
        super(DKLModel, self).__init__()
        # Plain configuration values
        self.n_features = n_features
        self.grid_bounds = grid_bounds
        # Sub-modules: the feature-extracting network, then the latent functions that
        # transform the features into the output
        self.feature_extractor = feature_extractor
        self.latent_functions = LatentFunctions(n_features=n_features, grid_bounds=grid_bounds)

    def forward(self, x):
        """Extract features from ``x``, scale them into the grid bounds, and apply the latent functions."""
        lower = self.grid_bounds[0]
        upper = self.grid_bounds[1]
        scaled = gpytorch.utils.scale_to_bounds(self.feature_extractor(x), lower, upper)
        # The latent functions receive the features with an extra trailing dimension of size 1
        return self.latent_functions(scaled.unsqueeze(-1))
    
# The AdditiveGridInducingVariationalGP trains multiple GPs on the features
# These are mixed together by the likelihood function to generate the final
# classification output

# Grid bounds specify the allowed values of features
# grid_size is the number of subdivisions along each dimension
class LatentFunctions(gpytorch.models.AdditiveGridInducingVariationalGP):
    """Zero-mean, RBF-kernel GP latent functions with one component per extracted feature.

    Args:
        n_features: Number of features produced by the feature extractor; forwarded to the
            base class as ``n_components``.
        grid_bounds: (lower, upper) range of allowed feature values; forwarded to the base
            class wrapped in a one-element list.
        grid_size: Forwarded to the base class as ``grid_size`` (described in this notebook
            as the number of subdivisions along each dimension).

    NOTE(review): ``mixing_params=False`` and ``sum_output=False`` are passed straight to the
    base class. The notebook's original note says mixing_params=False means the GP results are
    summed instead of mixed -- confirm against the gpytorch documentation.
    """

    def __init__(self, n_features=64, grid_bounds=(-10., 10.), grid_size=128):
        super(LatentFunctions, self).__init__(
            grid_size=grid_size,
            grid_bounds=[grid_bounds],
            n_components=n_features,
            mixing_params=False,
            sum_output=False,
        )
        # RBF kernel with its log-lengthscale initialized to 0
        rbf_kernel = gpytorch.kernels.RBFKernel()
        rbf_kernel.initialize(log_lengthscale=0)
        self.cov_module = rbf_kernel
        # Assigned after the base-class constructor has run; holds the (lower, upper) pair
        # exactly as passed in (the base class received it wrapped in a list)
        self.grid_bounds = grid_bounds

    def forward(self, x):
        """Return a Gaussian random variable with zero mean and RBF covariance over ``x``."""
        # Zero vector with one entry per element along x's first dimension, same tensor type as x
        zero_mean = Variable(x.data.new(len(x)).zero_())
        covariance = self.cov_module(x)
        return gpytorch.random_variables.GaussianRandomVariable(zero_mean, covariance)
    
# Initialize the deep kernel model around the pretrained feature extractor and move it to the GPU
model = DKLModel(feature_extractor)
model = model.cuda()
# Choose the likelihood function to use
# Here we use the softmax likelihood (e^z_i)/SUM_over_j(e^z_j)
# https://en.wikipedia.org/wiki/Softmax_function
likelihood = gpytorch.likelihoods.SoftmaxLikelihood(n_classes=10, n_features=model.n_features)
likelihood = likelihood.cuda()

In [6]:
# Training DataLoader for the DKL model, with a larger batch size than the one used for pretraining
# (this rebinds the train_loader name)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=2048, pin_memory=True)
# We use an Adam optimizer over both the model and likelihood parameters
# https://arxiv.org/abs/1412.6980
dkl_params = list(model.parameters())
dkl_params += list(likelihood.parameters())
optimizer = Adam(dkl_params, lr=0.01)

def train(epoch):
    """Run one epoch of joint training of the DKL model and the likelihood.

    Makes a single pass over the module-level ``train_loader``, stepping the module-level
    ``optimizer`` on the negative marginal log likelihood, and prints the loss of every
    mini-batch.

    Args:
        epoch: Epoch number; only used in the printed progress line.
    """
    model.train()
    likelihood.train()

    n_batches = len(train_loader)
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move the batch to the GPU and wrap it for autograd
        data, target = Variable(data.cuda()), Variable(target.cuda())
        optimizer.zero_grad()
        output = model(data)
        # Minimize the negative of the marginal log likelihood; n_data is the size of the
        # full training set, not of this mini-batch
        loss = -model.latent_functions.marginal_log_likelihood(likelihood, output, target, n_data=len(train_dataset))
        loss.backward()
        optimizer.step()
        print('Train Epoch: %d [%03d/%03d], Loss: %.6f' % (epoch, batch_idx + 1, n_batches, loss.data[0]))

def test():
    """Evaluate the DKL model on the test set and print its classification accuracy.

    Makes a single pass over the module-level ``test_loader``, taking the argmax of the
    likelihood's output as the predicted class.
    """
    model.eval()
    likelihood.eval()

    correct = 0
    for data, target in test_loader:
        data, target = data.cuda(), target.cuda()
        # volatile=True: no gradients are needed during evaluation
        data, target = Variable(data, volatile=True), Variable(target)
        output = likelihood(model(data))
        pred = output.argmax()
        correct += pred.eq(target.view_as(pred)).data.cpu().sum()
    n_test = len(test_loader.dataset)
    # No loss is reported: this loop only computes predictions, so there is no loss value
    # to average (printing one would always show a meaningless 0.0000)
    print('Test set: Accuracy: {}/{} ({:.3f}%)'.format(
        correct, n_test, 100. * correct / n_test))

# Train the DKL model for n_epochs epochs (numbered from 1), timing each training epoch with the
# IPython %time magic and evaluating on the test set after each one
n_epochs = 10
for epoch in range(1, n_epochs + 1):
    %time train(epoch)
    test()

Train Epoch: 1 [001/030], Loss: 41.579403
Train Epoch: 1 [002/030], Loss: 92.042076
Train Epoch: 1 [003/030], Loss: 73.519318
Train Epoch: 1 [004/030], Loss: 42.514950
Train Epoch: 1 [005/030], Loss: 37.503212
Train Epoch: 1 [006/030], Loss: 35.392902
Train Epoch: 1 [007/030], Loss: 29.345600
Train Epoch: 1 [008/030], Loss: 59.559193
Train Epoch: 1 [009/030], Loss: 30.034393
Train Epoch: 1 [010/030], Loss: 22.256338
Train Epoch: 1 [011/030], Loss: 42.953094
Train Epoch: 1 [012/030], Loss: 16.092073
Train Epoch: 1 [013/030], Loss: 14.936320
Train Epoch: 1 [014/030], Loss: 12.845394
Train Epoch: 1 [015/030], Loss: 12.620340
Train Epoch: 1 [016/030], Loss: 19.153234
Train Epoch: 1 [017/030], Loss: 9.901564
Train Epoch: 1 [018/030], Loss: 9.940659
Train Epoch: 1 [019/030], Loss: 7.963514
Train Epoch: 1 [020/030], Loss: 9.141623
Train Epoch: 1 [021/030], Loss: 13.696498
Train Epoch: 1 [022/030], Loss: 6.201659
Train Epoch: 1 [023/030], Loss: 6.377803
Train Epoch: 1 [024/030], Loss: 7.014893

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 7907/10000 (79.070%)
Train Epoch: 2 [001/030], Loss: 5.199722
Train Epoch: 2 [002/030], Loss: 3.798968
Train Epoch: 2 [003/030], Loss: 10.037350
Train Epoch: 2 [004/030], Loss: 3.029057
Train Epoch: 2 [005/030], Loss: 3.282596
Train Epoch: 2 [006/030], Loss: 3.209903
Train Epoch: 2 [007/030], Loss: 2.640893
Train Epoch: 2 [008/030], Loss: 2.577655
Train Epoch: 2 [009/030], Loss: 2.529065
Train Epoch: 2 [010/030], Loss: 2.375546
Train Epoch: 2 [011/030], Loss: 2.017558
Train Epoch: 2 [012/030], Loss: 1.960126
Train Epoch: 2 [013/030], Loss: 2.555728
Train Epoch: 2 [014/030], Loss: 1.938028
Train Epoch: 2 [015/030], Loss: 1.651893
Train Epoch: 2 [016/030], Loss: 3.425699
Train Epoch: 2 [017/030], Loss: 1.628858
Train Epoch: 2 [018/030], Loss: 1.355417
Train Epoch: 2 [019/030], Loss: 1.539618
Train Epoch: 2 [020/030], Loss: 2.777413
Train Epoch: 2 [021/030], Loss: 1.441511
Train Epoch: 2 [022/030], Loss: 1.551285
Train Epoch: 2 [023/030], Loss: 1.

Test set: Average loss: 0.0000, Accuracy: 9916/10000 (99.160%)
Train Epoch: 8 [001/030], Loss: -0.040436
Train Epoch: 8 [002/030], Loss: -0.026243
Train Epoch: 8 [003/030], Loss: -0.020521
Train Epoch: 8 [004/030], Loss: -0.032855
Train Epoch: 8 [005/030], Loss: -0.027933
Train Epoch: 8 [006/030], Loss: -0.036021
Train Epoch: 8 [007/030], Loss: -0.021509
Train Epoch: 8 [008/030], Loss: -0.040184
Train Epoch: 8 [009/030], Loss: -0.038900
Train Epoch: 8 [010/030], Loss: -0.032960
Train Epoch: 8 [011/030], Loss: -0.013044
Train Epoch: 8 [012/030], Loss: -0.027975
Train Epoch: 8 [013/030], Loss: -0.049556
Train Epoch: 8 [014/030], Loss: -0.022028
Train Epoch: 8 [015/030], Loss: 0.021581
Train Epoch: 8 [016/030], Loss: -0.013311
Train Epoch: 8 [017/030], Loss: -0.013941
Train Epoch: 8 [018/030], Loss: 0.015906
Train Epoch: 8 [019/030], Loss: -0.025901
Train Epoch: 8 [020/030], Loss: -0.040361
Train Epoch: 8 [021/030], Loss: -0.042751
Train Epoch: 8 [022/030], Loss: -0.031754
Train Epoch: 8 