This example shows how to use deep kernel learning (DKL) for classification. This is useful when you have very complex, high-dimensional inputs (such as images).

The example here is MNIST classification.

For an introduction to DKL see these papers:
https://arxiv.org/abs/1511.02222
https://arxiv.org/abs/1611.00336

In [29]:
# Import our GPyTorch library
import gpytorch

# Import some classes we will use from torch
from torch.autograd import Variable
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader

import torch
from torch import nn
from gpytorch.utils.lanczos_bidiagonalize import LanczosBidiagonalize

In [3]:
# Import datasets to access MNISTS and transforms to format data for learning
from torchvision import transforms, datasets

# Download (if necessary) and load the MNIST train and test splits.
# Both splits share a single preprocessing pipeline built with Compose:
#   1. ToTensor: turn each image into a torch.FloatTensor of shape
#      (colors x height x width) with values in [0.0, 1.0], as opposed to an
#      image laid out as (height x width x colors).
#   2. Normalize: standardize with a mean (0.1307) and standard deviation
#      (0.3081) that were calculated beforehand (not in this notebook).
# Transformation documentation here: http://pytorch.org/docs/master/torchvision/transforms.html
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
train_dataset = datasets.MNIST('/tmp', train=True, download=True, transform=mnist_transform)
test_dataset = datasets.MNIST('/tmp', train=False, download=True, transform=mnist_transform)

# Put the data into DataLoaders that yield batches of 256. The training data is
# shuffled because the order in which it is presented affects the outcome; the
# test data is not, because its order does not matter.
train_loader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=256,
    shuffle=False,
    pin_memory=True,
)

In [32]:
# Load VGG-19 with pretrained weights (pretrained=True) and move it to the GPU
from torchvision.models import vgg
model = vgg.vgg19(pretrained=True).cuda()

In [33]:
# Pick out the layer at index 21 of model.features for inspection
# (the same layer is passed to conv_to_dense further down)
foo = model.features[21]

In [34]:
foo.out_channels

512

In [104]:
def conv_to_dense(conv_layer, probe_img, rank):
    """Run Lanczos bidiagonalization on the linear map of a convolution layer.

    A bias-free copy of ``conv_layer`` and a transposed convolution sharing
    its weights are created on the GPU. They are wrapped in two closures
    (multiply by the map, multiply by its transpose) that are handed to
    ``LanczosBidiagonalize.lanczos_bidiagonalize``.

    Only the channel counts, kernel size, stride and padding of
    ``conv_layer`` are carried over to the copies.

    Args:
        conv_layer: convolution layer whose weights are copied.
        probe_img: tensor of size 1 x in_channels x imgw x imgh. Flattened,
            it is the starting vector passed to the Lanczos routine.
        rank: forwarded to ``LanczosBidiagonalize`` as ``max_iter``.

    Returns:
        Tuple ``(P, B, Q, new_conv_layer)``: the three factors returned by
        ``lanczos_bidiagonalize`` and the bias-free copy of the layer.
    """
    # probe_img should be 1 x in_channels x imgw x imgh
    imgw = probe_img.size(-2)
    imgh = probe_img.size(-1)
    in_channels = conv_layer.in_channels
    out_channels = conv_layer.out_channels
    kernel_size = conv_layer.kernel_size
    stride = conv_layer.stride
    padding = conv_layer.padding

    probe_vector = probe_img.view(in_channels * imgw * imgh)

    # strict=False tolerates a 'bias' entry in the source state dict, for
    # which the bias-free copy has no matching parameter.
    new_conv_layer = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding, stride=stride, bias=False).cuda()
    new_conv_layer.load_state_dict(conv_layer.state_dict(), strict=False)
    # The transposed convolution reuses the same weight tensor.
    new_conv_layer_t = nn.ConvTranspose2d(out_channels, in_channels, kernel_size=kernel_size, padding=padding, stride=stride, bias=False).cuda()
    new_conv_layer_t.load_state_dict(new_conv_layer.state_dict(), strict=False)

    def matmul_closure(v):
        # Flat input vector -> image -> convolution output (raw tensor data).
        return new_conv_layer(Variable(v.contiguous().view(1, in_channels, imgw, imgh)).cuda()).data

    # Measure the spatial size of the convolution's output instead of assuming
    # it equals the input's; the two differ unless stride is 1 and the padding
    # exactly compensates for the kernel size.
    probe_out = matmul_closure(probe_vector)
    out_w = probe_out.size(-2)
    out_h = probe_out.size(-1)

    def matmul_t_closure(v):
        # Flat output-space vector -> feature map -> transposed convolution.
        feature_map = Variable(v.contiguous().view(1, out_channels, out_w, out_h)).cuda()
        # output_size pins the result to the probe's spatial size, which a
        # strided transposed convolution cannot determine on its own.
        return new_conv_layer_t(feature_map, output_size=[imgw, imgh]).data

    lb = LanczosBidiagonalize(max_iter=rank, cls=type(probe_vector))
    n_rows = out_channels * out_w * out_h
    n_cols = in_channels * imgw * imgh
    P, B, Q = lb.lanczos_bidiagonalize(matmul_closure, matmul_t_closure, probe_vector, n_rows, n_cols)
    return P, B, Q, new_conv_layer

In [128]:
# Random 1 x 512 x 9 x 9 probe image on the GPU, used as the Lanczos starting vector.
# The third argument (800) is conv_to_dense's `rank`, which it passes on as max_iter.
probe_img = torch.randn(1, 512, 9, 9).cuda()
P, B, Q, new_conv_layer = conv_to_dense(model.features[21].cuda(), probe_img, 800)

  return self.sub(other)


In [129]:
# Apply the factorization P * B * Q^T to a fresh random image, multiplying
# right-to-left so that each step is a matrix-vector product
test_img = torch.rand(1, 512, 9, 9).cuda()
my_out = P.matmul(B.matmul(Q.t().matmul(test_img.view(512*9*9)))).view(1, 512, 9, 9)
# Reference result: the same image pushed through the bias-free copy of the layer
real_out = new_conv_layer(Variable(test_img).cuda()).data

In [130]:
my_out[0, 0]


-1.7855 -2.5733 -3.0703 -3.1339 -2.9088 -2.8971 -2.9612 -2.8048 -1.7661
-2.8222 -4.2871 -4.8346 -4.8978 -4.7709 -4.9101 -4.5188 -4.4545 -3.0383
-2.6252 -4.3977 -4.6451 -4.5744 -4.2396 -4.3563 -4.0236 -4.3268 -3.0988
-2.4851 -4.4187 -4.4565 -4.4548 -4.0000 -4.3433 -4.3857 -4.4935 -2.8816
-2.5956 -4.1987 -4.1852 -4.0755 -3.2991 -4.1004 -4.5548 -4.9095 -3.0058
-2.5259 -3.9930 -4.4466 -4.4880 -3.5658 -4.1926 -4.3241 -4.5980 -2.8738
-2.4565 -3.8954 -4.0036 -4.3539 -4.0968 -4.3666 -3.8871 -3.9920 -2.5937
-2.7292 -4.3585 -4.3107 -4.3678 -4.2729 -4.2637 -3.8704 -4.0373 -2.5673
-1.8696 -2.9644 -2.8937 -2.9995 -2.9951 -2.8629 -2.5754 -2.5467 -1.6593
[torch.cuda.FloatTensor of size 9x9 (GPU 0)]

In [131]:
real_out[0, 0]


-1.6695 -3.0159 -2.7022 -2.8106 -3.0404 -3.5181 -2.9043 -2.7308 -2.7054
-2.3611 -4.4625 -3.8304 -3.9910 -4.1196 -4.0219 -3.6126 -3.2272 -3.2531
-2.8150 -3.8669 -3.7856 -4.4363 -4.5302 -4.1949 -3.5616 -3.8418 -3.1242
-2.3715 -3.7058 -3.5787 -4.1800 -3.7834 -3.5073 -3.4099 -3.9138 -2.9160
-2.4590 -3.8683 -3.5172 -3.5815 -3.6392 -4.1192 -3.8915 -3.7258 -3.3866
-2.5089 -3.9754 -3.2735 -3.7530 -4.1245 -3.4110 -3.2895 -3.2688 -3.4373
-2.5451 -3.7677 -3.4878 -4.2226 -4.4720 -3.9447 -3.4788 -3.8035 -3.1894
-2.5814 -3.2681 -3.7609 -4.0304 -4.2612 -3.8233 -3.7195 -3.8331 -3.1859
-1.6157 -2.2874 -2.9639 -2.4920 -2.7240 -2.9744 -3.4825 -3.0419 -2.1320
[torch.cuda.FloatTensor of size 9x9 (GPU 0)]

## Define the feature extractor for our deep kernel

In [40]:
# Import torch's neural network
# Documentation here: http://pytorch.org/docs/master/nn.html
from torch import nn
# Import torch.nn.functional for various activation/pooling functions
# Documentation here: http://pytorch.org/docs/master/nn.html#torch-nn-functional
from torch.nn import functional as F

# We make a classic LeNet Architecture sans a final prediction layer to 10 outputs. This will serve as a feature
# extractor reducing the dimensionality of our data down to 64. We will pretrain these layers by adding on a 
# final classifying 64-->10 layer
# https://medium.com/@siddharthdas_32104/cnns-architectures-lenet-alexnet-vgg-googlenet-resnet-and-more-666091488df5
class LeNetFeatureExtractor(nn.Module):
    """LeNet-style network without a classification head.

    Two (convolution -> batch norm -> ReLU -> 2x2 max pool) stages are followed
    by a fully connected layer with batch norm and ReLU, producing 64
    non-negative features per input. The hard-coded 32 * 7 * 7 flatten size
    corresponds to single-channel 28 x 28 inputs.
    """

    def __init__(self):
        super(LeNetFeatureExtractor, self).__init__()
        # Stage 1: 1 -> 16 channels
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, padding=2, bias=False)
        self.norm1 = nn.BatchNorm2d(num_features=16)
        # Stage 2: 16 -> 32 channels
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, padding=2, bias=False)
        self.norm2 = nn.BatchNorm2d(num_features=32)
        # Head: flattened 32 x 7 x 7 feature map -> 64 features
        self.fc3 = nn.Linear(in_features=32 * 7 * 7, out_features=64)
        self.norm3 = nn.BatchNorm1d(num_features=64)

    def forward(self, x):
        """Return the 64-dimensional feature vectors for a batch of images."""
        stage1 = F.relu(self.norm1(self.conv1(x)))
        stage1 = F.max_pool2d(stage1, 2)

        stage2 = F.relu(self.norm2(self.conv2(stage1)))
        stage2 = F.max_pool2d(stage2, 2)

        flat = stage2.view(-1, 32 * 7 * 7)
        return F.relu(self.norm3(self.fc3(flat)))
    
# Instantiate the feature extractor and move it to the GPU
feature_extractor = LeNetFeatureExtractor().cuda()

### Pretrain the feature extractor a bit

In [41]:
# Make a final classifier layer (64 features --> 10 outputs) that operates on the
# feature extractor's output, and place it on the GPU
classifier = nn.Linear(64, 10).cuda()
# Make the list of parameters to optimize: those of the feature extractor
# followed by those of the classifier
params = list(feature_extractor.parameters()) + list(classifier.parameters())
# Train the network using stochastic gradient descent (learning rate 0.1, momentum 0.9)
optimizer = SGD(params, lr=0.1, momentum=0.9)

def pretrain(epoch):
    """Run one pretraining epoch over ``train_loader`` and print the mean loss.

    For every batch: move it to the GPU as Variables, clear the gradients left
    over from the previous step, extract the 64-dimensional features, feed them
    to ``classifier``, take the negative log likelihood of the log-softmax
    output, backpropagate, and let the optimizer update the parameters.

    NOTE: log_softmax followed by nll_loss could presumably be replaced by
    torch.nn.functional.cross_entropy, which is described as combining the two.

    Args:
        epoch: epoch number; only used in the printed message.
    """
    # Train mode is set on the feature extractor (a module containing batch-norm
    # layers); the classifier is just a single linear layer.
    feature_extractor.train()
    running_loss = 0.
    for images, labels in train_loader:
        images = Variable(images.cuda())
        labels = Variable(labels.cuda())
        optimizer.zero_grad()
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()
        # The loss is a per-batch mean, so weight it by the batch size
        running_loss += batch_loss.data[0] * len(images)
    mean_loss = running_loss / len(train_dataset)
    print('Train Epoch: %d\tLoss: %.6f' % (epoch, mean_loss))

def pretest():
    """Evaluate the feature extractor plus classifier on ``test_loader``.

    Accumulates the summed negative log likelihood and the number of correct
    predictions over the whole test set, then prints the average loss and the
    accuracy.

    See http://pytorch.org/docs/master/nn.html#torch.nn.Module.train for what
    eval mode changes; it is relevant here because the feature extractor
    contains BatchNorm layers.
    """
    feature_extractor.eval()
    total_loss = 0
    n_correct = 0
    for images, labels in test_loader:
        images, labels = images.cuda(), labels.cuda()
        # volatile=True: inference only, nothing needs to be kept for backprop
        images, labels = Variable(images, volatile=True), Variable(labels)
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        # size_average=False: sum up the batch loss instead of averaging it
        total_loss += F.nll_loss(log_probs, labels, size_average=False).data[0]
        # The predicted class is the index of the max log-probability
        predicted = log_probs.data.max(1, keepdim=True)[1]
        n_correct += predicted.eq(labels.data.view_as(predicted)).cpu().sum()
    n_test = len(test_loader.dataset)
    total_loss /= n_test
    accuracy_pct = 100. * n_correct / len(test_loader.dataset)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)'.format(
        total_loss, n_correct, len(test_loader.dataset), accuracy_pct))

# Pretrain for n_epochs epochs, evaluating on the test set after each one
n_epochs = 3
for epoch in range(1, n_epochs + 1):
    pretrain(epoch)
    pretest()

Train Epoch: 1	Loss: 0.158651
Test set: Average loss: 0.0402, Accuracy: 9869/10000 (98.690%)
Train Epoch: 2	Loss: 0.038372
Test set: Average loss: 0.0329, Accuracy: 9890/10000 (98.900%)


KeyboardInterrupt: 

## Define the deep kernel GP

In [136]:
# Shape produced by conv1 for a single training image (batch dimension added with unsqueeze)
feature_extractor.conv1(Variable(train_dataset[0][0]).unsqueeze(0).cuda()).shape

# Dimensions of conv1 viewed as a matrix acting on flattened images:
# rows = 16 output channels x 28 x 28, columns = 1 input channel x 28 x 28
n_rows = 16*28*28
n_cols = 28*28

In [163]:
# Transposed convolution (16 -> 1 channels) that reuses conv1's weights
from torch.nn import ConvTranspose2d
conv1_transpose = ConvTranspose2d(16, 1, kernel_size=5, padding=2, bias=False).cuda()
conv1_transpose.load_state_dict(feature_extractor.conv1.state_dict())

# Closures for the Lanczos routine: each reshapes a flat vector into an image /
# feature map, applies conv1 or its transpose on the GPU, and returns the raw tensor data
matmul_closure = lambda v: feature_extractor.conv1(Variable(v.contiguous().view(1, 1, 28, 28)).cuda()).data
matmul_t_closure = lambda v: conv1_transpose(Variable(v.contiguous().view(1, 16, 28, 28)).cuda()).data
# Starting vector: the first training image, flattened to 28*28 elements
v = train_dataset[0][0].view(28*28).cuda()

from gpytorch.utils.lanczos_bidiagonalize import LanczosBidiagonalize

In [183]:
# Run Lanczos bidiagonalization (max_iter=800) on conv1, starting from v
from gpytorch.utils.lanczos_bidiagonalize import LanczosBidiagonalize
lb = LanczosBidiagonalize(max_iter=800, cls=type(v))
P, B, Q = lb.lanczos_bidiagonalize(matmul_closure, matmul_t_closure, v, n_rows, n_cols)

  return self.sub(other)


In [184]:
# Get output of "convolution" using Lanczos decomposition.
# The product is evaluated right-to-left so every step is a matrix-vector
# multiply; forming P * B * Q^T first would build a dense
# (16*28*28) x (28*28) matrix just to apply it to a single vector.
my_out = P.matmul(B.matmul(Q.t().matmul(train_dataset[1][0].view(28*28).cuda()))).view(1, 16, 28, 28)
# Get output of convolution using actual convolutional layer forward
real_out = feature_extractor.conv1(Variable(train_dataset[1][0]).unsqueeze(0).cuda())

In [185]:
my_out[0, :2, :9, :9]


(0 ,.,.) = 
  0.5168  0.3948  0.1532  0.1532  0.1532  0.1532  0.1532  0.1532  0.1532
  0.7371  0.6906  0.4688  0.4688  0.4688  0.4688  0.4688  0.4688  0.4688
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.3846
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.3931 -0.2094

(1 ,.,.) = 
 -0.4860 -0.3472 -0.1353 -0.1353 -0.1353 -0.1353 -0.1353 -0.1353 -0.1353
 -0.5565 -0.3557 -0.0543 -0.0543 -0.0543 -0.0543 -0.0543 -0.0543 -0.0543
 -0.5038 -0.3589 -0.1084 -0.1084 -0.1084 -0.1085 -0.1085 -0.1085 -0.1084
 -0.5038 -0.3589 -0.1084 -0.1085 -0.1085 -0.1084 -0.1085 -0.1085 -0.1084
 -0.5038 -0.3589 -0.1084 

In [186]:
real_out[0, :2, :9, :9]

Variable containing:
(0 ,.,.) = 
  0.5168  0.3948  0.1532  0.1532  0.1532  0.1532  0.1532  0.1532  0.1532
  0.7372  0.6906  0.4688  0.4688  0.4688  0.4688  0.4688  0.4688  0.4688
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.5276  0.3846
  0.6974  0.7143  0.5276  0.5276  0.5276  0.5276  0.5276  0.3930 -0.2094

(1 ,.,.) = 
 -0.4860 -0.3472 -0.1353 -0.1353 -0.1353 -0.1353 -0.1353 -0.1353 -0.1353
 -0.5565 -0.3557 -0.0543 -0.0543 -0.0543 -0.0543 -0.0543 -0.0543 -0.0543
 -0.5038 -0.3589 -0.1084 -0.1084 -0.1084 -0.1084 -0.1084 -0.1084 -0.1084
 -0.5038 -0.3589 -0.1084 -0.1084 -0.1084 -0.1084 -0.1084 -0.1084 -0.1084
 -0.5

In [106]:
B


 4.2183  1.5955
 0.0000  4.2027
[torch.FloatTensor of size 2x2]