This example shows how to use deep kernel learning (DKL) for classification. This is useful when you have very complex high-dimensional inputs (such as an image).

The example here is MNIST classification

For an introduction to DKL see these papers:
https://arxiv.org/abs/1511.02222
https://arxiv.org/abs/1611.00336

In [1]:
# Import our GPyTorch library
import gpytorch

# Import some classes we will use from torch
from torch.autograd import Variable
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader

In [2]:
# Import datasets to access MNIST and transforms to format data for learning
from torchvision import transforms, datasets

# Download and load the MNIST dataset to train on
# Compose lets us do multiple transformations. Specifically, make the data a torch.FloatTensor of shape
# (colors x height x width) in the range [0.0, 1.0] as opposed to an RGB image with shape (height x width x colors)
# then normalize using mean (0.1307) and standard deviation (0.3081) already calculated (not here)

# Transformation documentation here: http://pytorch.org/docs/master/torchvision/transforms.html
# Training split: convert each image to a tensor, then normalize it
train_dataset = datasets.MNIST(
    root='/tmp',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]),
)
# Test split, preprocessed exactly like the training split
test_dataset = datasets.MNIST(
    root='/tmp',
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]),
)

# Put the data into DataLoaders. Only the training data is shuffled: the order in which
# training batches are presented affects the outcome, while the order of test data does not
train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=256, shuffle=False, pin_memory=True)

## Define the feature extractor for our deep kernel

In [3]:
# Import torch's neural network
# Documentation here: http://pytorch.org/docs/master/nn.html
from torch import nn
# Import torch.nn.functional for various activation/pooling functions
# Documentation here: http://pytorch.org/docs/master/nn.html#torch-nn-functional
from torch.nn import functional as F

# We make a classic LeNet Architecture sans a final prediction layer to 10 outputs. This will serve as a feature
# extractor reducing the dimensionality of our data down to 64. We will pretrain these layers by adding on a 
# final classifying 64-->10 layer
# https://medium.com/@siddharthdas_32104/cnns-architectures-lenet-alexnet-vgg-googlenet-resnet-and-more-666091488df5
class LeNetFeatureExtractor(nn.Module):
    """LeNet-style network without the final 10-way prediction layer.

    Two convolution / batch-norm / ReLU / max-pool stages are followed by a
    fully connected layer, producing 64 non-negative features per input image.
    """

    def __init__(self):
        super(LeNetFeatureExtractor, self).__init__()
        # Stage 1: 1 -> 16 channels
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2)
        self.norm1 = nn.BatchNorm2d(16)
        # Stage 2: 16 -> 32 channels
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, padding=2)
        self.norm2 = nn.BatchNorm2d(32)
        # Fully connected projection of the flattened 32 x 7 x 7 maps down to 64 features
        self.fc3 = nn.Linear(32 * 7 * 7, 64)
        self.norm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        """Return the 64-dimensional feature vectors for the image batch ``x``."""
        stage1 = F.relu(self.norm1(self.conv1(x)))
        stage1 = F.max_pool2d(stage1, 2)
        stage2 = F.relu(self.norm2(self.conv2(stage1)))
        stage2 = F.max_pool2d(stage2, 2)
        # Flatten every example into a single row before the linear layer
        flat = stage2.view(-1, 32 * 7 * 7)
        return F.relu(self.norm3(self.fc3(flat)))
    
# Build the feature extractor, then move it to the GPU
feature_extractor = LeNetFeatureExtractor()
feature_extractor = feature_extractor.cuda()

### Pretrain the feature extractor a bit

In [4]:
# Final 64 --> 10 classifier layer that operates on the feature extractor's output
classifier = nn.Linear(64, 10)
classifier = classifier.cuda()
# Collect the parameters to optimize: the feature extractor's first, then the classifier's
params = list(feature_extractor.parameters())
params.extend(classifier.parameters())
# Train the network with stochastic gradient descent (with momentum)
optimizer = SGD(params, lr=0.1, momentum=0.9)

# Define our pretraining function
#    Set feature extractor to train mode (needed because it contains BatchNorm layers, which behave
#    differently in train and eval mode; the classifier is just a single linear layer)
#    Iterate through train_loader
#    Put the data on the GPU as a Variable
#    Zero out the gradients before backprop (needed because PyTorch accumulates gradients across
#    backward() calls by default)
#    Extract the 64-dimensional feature vector
#    Feed the features into the classifying layer and output the log softmax
#    Calculate negative log likelihood loss
#    (torch.nn.functional.cross_entropy combines the two steps above and could replace them)
#    Backprop
#    Take an optimizer step to update the parameters
#    Accumulate training loss
#    Print result of epoch
def pretrain(epoch):
    """Run one pretraining epoch over ``train_loader`` and print the average loss.

    ``epoch`` is only used to label the printed line.
    """
    feature_extractor.train()
    running_loss = 0.
    for images, labels in train_loader:
        images, labels = Variable(images.cuda()), Variable(labels.cuda())
        optimizer.zero_grad()
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so the epoch figure is a per-example average
        running_loss += batch_loss.data[0] * len(images)
    mean_loss = running_loss / len(train_dataset)
    print('Train Epoch: %d\tLoss: %.6f' % (epoch, mean_loss))

# Set feature extractor to eval mode (this only affects layers such as Dropout and BatchNorm; the feature
# extractor uses BatchNorm, so it matters here)
# http://pytorch.org/docs/master/nn.html#torch.nn.Module.train
# Set test_loss accumulator and correct counter
# Iterate through test data
#    volatile=True tells autograd not to record the computation graph, since no gradients are needed
#          when testing. This saves memory and computation
#    calculate loss and accumulate
#    make prediction and check accuracy
def pretest():
    """Evaluate the feature extractor plus classifier on ``test_loader``.

    Prints the average negative log likelihood and the accuracy over the test set.
    """
    feature_extractor.eval()
    n_examples = len(test_loader.dataset)
    total_loss = 0
    n_correct = 0
    for images, labels in test_loader:
        images = Variable(images.cuda(), volatile=True)
        labels = Variable(labels.cuda())
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        # Sum (rather than average) the batch loss; it is divided by the dataset size below
        total_loss += F.nll_loss(log_probs, labels, size_average=False).data[0]
        # The predicted class is the index of the largest log-probability
        predicted = log_probs.data.max(1, keepdim=True)[1]
        n_correct += predicted.eq(labels.data.view_as(predicted)).cpu().sum()
    total_loss /= n_examples
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)'.format(
        total_loss, n_correct, n_examples, 100. * n_correct / n_examples))

# Number of passes over the training set used to pretrain the feature extractor
n_epochs = 3
for epoch in range(1, n_epochs + 1):
    # Train for one epoch, then report loss and accuracy on the test set
    pretrain(epoch)
    pretest()

Train Epoch: 1	Loss: 0.150723
Test set: Average loss: 0.0457, Accuracy: 9851/10000 (98.510%)
Train Epoch: 2	Loss: 0.035508
Test set: Average loss: 0.0292, Accuracy: 9900/10000 (99.000%)
Train Epoch: 3	Loss: 0.024795
Test set: Average loss: 0.0406, Accuracy: 9877/10000 (98.770%)


## Define the deep kernel GP

In [5]:
# now this is our first exposure to the usefulness of gpytorch

# A gpytorch Module is a subclass of torch.nn.Module
class DKLModel(gpytorch.Module):
    """Deep kernel model: a feature-extracting network followed by GP latent functions."""

    def __init__(self, feature_extractor, n_features=64, grid_bounds=(-10., 10.)):
        super(DKLModel, self).__init__()
        # Dimension of the extracted feature vector (64 by default)
        self.n_features = n_features
        # Range the extracted features are scaled into
        self.grid_bounds = grid_bounds
        # Network that turns the raw inputs into features
        self.feature_extractor = feature_extractor
        # Latent functions that transform the (scaled) features into the output
        self.latent_functions = LatentFunctions(n_features=n_features, grid_bounds=grid_bounds)

    def forward(self, x):
        """Extract features from ``x``, scale them into the grid bounds and apply the latent functions."""
        raw_features = self.feature_extractor(x)
        scaled_features = gpytorch.utils.scale_to_bounds(
            raw_features, self.grid_bounds[0], self.grid_bounds[1])
        # The latent functions receive the features with an extra trailing dimension
        return self.latent_functions(scaled_features.unsqueeze(-1))
    
# The AdditiveGridInducingVariationalGP trains multiple GPs on the features
# These are mixed together by the likelihood function to generate the final
# classification output

# Grid bounds specify the allowed values of features
# grid_size is the number of subdivisions along each dimension
class LatentFunctions(gpytorch.models.AdditiveGridInducingVariationalGP):
    """Additive grid-inducing variational GP with one component per extracted feature.

    Uses a zero mean and an RBF kernel.
    """

    def __init__(self, n_features=64, grid_bounds=(-10., 10.), grid_size=128):
        # n_features is the number of features coming out of the feature extractor;
        # mixing_params and sum_output are both switched off
        super(LatentFunctions, self).__init__(
            grid_size=grid_size,
            grid_bounds=[grid_bounds],
            n_components=n_features,
            mixing_params=False,
            sum_output=False,
        )
        # RBF kernel with its log-lengthscale initialized to 0
        kernel = gpytorch.kernels.RBFKernel()
        kernel.initialize(log_lengthscale=0)
        self.cov_module = kernel
        self.grid_bounds = grid_bounds

    def forward(self, x):
        """Return the Gaussian prior at ``x``: zero mean, RBF covariance."""
        # Zero mean with one entry per element along the first dimension of x
        zero_mean = Variable(x.data.new(len(x)).zero_())
        return gpytorch.random_variables.GaussianRandomVariable(zero_mean, self.cov_module(x))
    
# Initialize the model around the feature extractor and move it to the GPU
model = DKLModel(feature_extractor)
model = model.cuda()
# Choose the likelihood function to use
# Here we use the softmax likelihood (e^z_i)/SUM_over_i(e^z_i)
# https://en.wikipedia.org/wiki/Softmax_function
likelihood = gpytorch.likelihoods.SoftmaxLikelihood(
    n_features=model.n_features,
    n_classes=10,
).cuda()

In [7]:
# Training DataLoader for the DKL model, with a batch size of 2048
train_loader = DataLoader(dataset=train_dataset, batch_size=2048, shuffle=True, pin_memory=True)

# Adam optimizer (https://arxiv.org/abs/1412.6980) over both the model and likelihood parameters
optimizer = Adam(
    params=[
        {'params': model.parameters()},
        {'params': likelihood.parameters()},  # SoftmaxLikelihood contains parameters
    ],
    lr=0.01,
)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.VariationalMarginalLogLikelihood(
    likelihood,
    model,
    n_data=len(train_dataset),
)

def train(epoch):
    """Run one training epoch of the DKL model over ``train_loader``.

    The loss of every batch is printed as it is computed; nothing is returned.

    Args:
        epoch: Epoch number, used only to label the printed progress lines.
    """
    model.train()
    likelihood.train()

    n_batches = len(train_loader)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        # The optimizer minimizes, so use the negated marginal log likelihood as the loss
        loss = -mll(output, target)
        loss.backward()
        optimizer.step()
        print('Train Epoch: %d [%03d/%03d], Loss: %.6f' % (epoch, batch_idx + 1, n_batches, loss.data[0]))

def test():
    """Evaluate the DKL model on ``test_loader`` and print its classification accuracy.

    No loss is computed here, so only the accuracy is reported. Earlier versions
    printed an "Average loss" that was never accumulated and therefore always 0.
    """
    model.eval()
    likelihood.eval()

    correct = 0
    for data, target in test_loader:
        data, target = data.cuda(), target.cuda()
        # volatile=True: no gradients are needed at test time
        data, target = Variable(data, volatile=True), Variable(target)
        # Pass the model output through the likelihood, then take the predicted class
        output = likelihood(model(data))
        pred = output.argmax()
        correct += pred.eq(target.view_as(pred)).data.cpu().sum()
    n_examples = len(test_loader.dataset)
    print('Test set: Accuracy: {}/{} ({:.3f}%)'.format(
        correct, n_examples, 100. * correct / n_examples))

# Number of passes over the training set used to train the DKL model
n_epochs = 10

# While we have theoretically fast algorithms for toeplitz matrix-vector multiplication, the hardware of GPUs
# is so well-designed that naive multiplication on them beats the current implementation of our algorithm (despite
# theoretically fast computation). Because of this, we set the use_toeplitz flag to false to minimize runtime
with gpytorch.settings.use_toeplitz(False):
    for epoch in range(1, n_epochs + 1):
        # %time is an IPython magic: it reports how long the training epoch took
        %time train(epoch)
        test()

Train Epoch: 1 [001/030], Loss: 36.142605
Train Epoch: 1 [002/030], Loss: 36.090061
Train Epoch: 1 [003/030], Loss: 35.627544
Train Epoch: 1 [004/030], Loss: 34.603943
Train Epoch: 1 [005/030], Loss: 33.154297
Train Epoch: 1 [006/030], Loss: 31.405090
Train Epoch: 1 [007/030], Loss: 29.622997
Train Epoch: 1 [008/030], Loss: 27.900564
Train Epoch: 1 [009/030], Loss: 26.234581
Train Epoch: 1 [010/030], Loss: 24.712194
Train Epoch: 1 [011/030], Loss: 23.318537
Train Epoch: 1 [012/030], Loss: 22.062542
Train Epoch: 1 [013/030], Loss: 20.924089
Train Epoch: 1 [014/030], Loss: 19.880186
Train Epoch: 1 [015/030], Loss: 18.957228
Train Epoch: 1 [016/030], Loss: 18.153137
Train Epoch: 1 [017/030], Loss: 17.367653
Train Epoch: 1 [018/030], Loss: 16.663593
Train Epoch: 1 [019/030], Loss: 16.012274
Train Epoch: 1 [020/030], Loss: 15.308195
Train Epoch: 1 [021/030], Loss: 14.672304
Train Epoch: 1 [022/030], Loss: 14.030581
Train Epoch: 1 [023/030], Loss: 13.444436
Train Epoch: 1 [024/030], Loss: 12

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 6121/10000 (61.210%)
Train Epoch: 2 [001/030], Loss: 9.258375
Train Epoch: 2 [002/030], Loss: 8.867882
Train Epoch: 2 [003/030], Loss: 8.495071
Train Epoch: 2 [004/030], Loss: 8.101971
Train Epoch: 2 [005/030], Loss: 7.802680
Train Epoch: 2 [006/030], Loss: 7.402121
Train Epoch: 2 [007/030], Loss: 7.032426
Train Epoch: 2 [008/030], Loss: 6.643226
Train Epoch: 2 [009/030], Loss: 6.348939
Train Epoch: 2 [010/030], Loss: 6.104055
Train Epoch: 2 [011/030], Loss: 5.998545
Train Epoch: 2 [012/030], Loss: 5.626659
Train Epoch: 2 [013/030], Loss: 5.302618
Train Epoch: 2 [014/030], Loss: 5.285032
Train Epoch: 2 [015/030], Loss: 4.887465
Train Epoch: 2 [016/030], Loss: 4.772480
Train Epoch: 2 [017/030], Loss: 4.505405
Train Epoch: 2 [018/030], Loss: 4.331192
Train Epoch: 2 [019/030], Loss: 4.121179
Train Epoch: 2 [020/030], Loss: 4.084509
Train Epoch: 2 [021/030], Loss: 3.933098
Train Epoch: 2 [022/030], Loss: 3.813060
Train Epoch: 2 [023/030], Loss: 3.6

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9703/10000 (97.030%)
Train Epoch: 3 [001/030], Loss: 2.849145
Train Epoch: 3 [002/030], Loss: 2.737780
Train Epoch: 3 [003/030], Loss: 2.630193
Train Epoch: 3 [004/030], Loss: 2.631416
Train Epoch: 3 [005/030], Loss: 2.499823
Train Epoch: 3 [006/030], Loss: 2.418246
Train Epoch: 3 [007/030], Loss: 2.367687
Train Epoch: 3 [008/030], Loss: 2.339429
Train Epoch: 3 [009/030], Loss: 2.255776
Train Epoch: 3 [010/030], Loss: 2.225314
Train Epoch: 3 [011/030], Loss: 2.154285
Train Epoch: 3 [012/030], Loss: 2.085375
Train Epoch: 3 [013/030], Loss: 2.034397
Train Epoch: 3 [014/030], Loss: 2.007726
Train Epoch: 3 [015/030], Loss: 1.919988
Train Epoch: 3 [016/030], Loss: 1.904859
Train Epoch: 3 [017/030], Loss: 1.878610
Train Epoch: 3 [018/030], Loss: 1.832925
Train Epoch: 3 [019/030], Loss: 1.773328
Train Epoch: 3 [020/030], Loss: 1.738720
Train Epoch: 3 [021/030], Loss: 1.738039
Train Epoch: 3 [022/030], Loss: 1.649673
Train Epoch: 3 [023/030], Loss: 1.6

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9763/10000 (97.630%)
Train Epoch: 4 [001/030], Loss: 1.402782
Train Epoch: 4 [002/030], Loss: 1.380410
Train Epoch: 4 [003/030], Loss: 1.351071
Train Epoch: 4 [004/030], Loss: 1.345011
Train Epoch: 4 [005/030], Loss: 1.307389
Train Epoch: 4 [006/030], Loss: 1.262983
Train Epoch: 4 [007/030], Loss: 1.260225
Train Epoch: 4 [008/030], Loss: 1.252252
Train Epoch: 4 [009/030], Loss: 1.243247
Train Epoch: 4 [010/030], Loss: 1.212812
Train Epoch: 4 [011/030], Loss: 1.200291
Train Epoch: 4 [012/030], Loss: 1.171254
Train Epoch: 4 [013/030], Loss: 1.166869
Train Epoch: 4 [014/030], Loss: 1.120746
Train Epoch: 4 [015/030], Loss: 1.121987
Train Epoch: 4 [016/030], Loss: 1.094936
Train Epoch: 4 [017/030], Loss: 1.086679
Train Epoch: 4 [018/030], Loss: 1.078507
Train Epoch: 4 [019/030], Loss: 1.057760
Train Epoch: 4 [020/030], Loss: 1.040013
Train Epoch: 4 [021/030], Loss: 1.033480
Train Epoch: 4 [022/030], Loss: 1.021665
Train Epoch: 4 [023/030], Loss: 0.9

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9864/10000 (98.640%)
Train Epoch: 5 [001/030], Loss: 0.923116
Train Epoch: 5 [002/030], Loss: 0.891815
Train Epoch: 5 [003/030], Loss: 0.884920
Train Epoch: 5 [004/030], Loss: 0.865727
Train Epoch: 5 [005/030], Loss: 0.879008
Train Epoch: 5 [006/030], Loss: 0.859716
Train Epoch: 5 [007/030], Loss: 0.857543
Train Epoch: 5 [008/030], Loss: 0.835173
Train Epoch: 5 [009/030], Loss: 0.836762
Train Epoch: 5 [010/030], Loss: 0.826224
Train Epoch: 5 [011/030], Loss: 0.811641
Train Epoch: 5 [012/030], Loss: 0.802732
Train Epoch: 5 [013/030], Loss: 0.787749
Train Epoch: 5 [014/030], Loss: 0.791700
Train Epoch: 5 [015/030], Loss: 0.779832
Train Epoch: 5 [016/030], Loss: 0.778598
Train Epoch: 5 [017/030], Loss: 0.757305
Train Epoch: 5 [018/030], Loss: 0.766895
Train Epoch: 5 [019/030], Loss: 0.742244
Train Epoch: 5 [020/030], Loss: 0.752526
Train Epoch: 5 [021/030], Loss: 0.724963
Train Epoch: 5 [022/030], Loss: 0.737638
Train Epoch: 5 [023/030], Loss: 0.7

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9901/10000 (99.010%)
Train Epoch: 6 [001/030], Loss: 0.677170
Train Epoch: 6 [002/030], Loss: 0.667035
Train Epoch: 6 [003/030], Loss: 0.658754
Train Epoch: 6 [004/030], Loss: 0.659502
Train Epoch: 6 [005/030], Loss: 0.648770
Train Epoch: 6 [006/030], Loss: 0.646387
Train Epoch: 6 [007/030], Loss: 0.640344
Train Epoch: 6 [008/030], Loss: 0.641842
Train Epoch: 6 [009/030], Loss: 0.626409
Train Epoch: 6 [010/030], Loss: 0.628503
Train Epoch: 6 [011/030], Loss: 0.624434
Train Epoch: 6 [012/030], Loss: 0.614327
Train Epoch: 6 [013/030], Loss: 0.616552
Train Epoch: 6 [014/030], Loss: 0.607694
Train Epoch: 6 [015/030], Loss: 0.599544
Train Epoch: 6 [016/030], Loss: 0.593831
Train Epoch: 6 [017/030], Loss: 0.602182
Train Epoch: 6 [018/030], Loss: 0.608630
Train Epoch: 6 [019/030], Loss: 0.576198
Train Epoch: 6 [020/030], Loss: 0.600555
Train Epoch: 6 [021/030], Loss: 0.587777
Train Epoch: 6 [022/030], Loss: 0.576071
Train Epoch: 6 [023/030], Loss: 0.5

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9909/10000 (99.090%)
Train Epoch: 7 [001/030], Loss: 0.539820
Train Epoch: 7 [002/030], Loss: 0.541358
Train Epoch: 7 [003/030], Loss: 0.549363
Train Epoch: 7 [004/030], Loss: 0.523764
Train Epoch: 7 [005/030], Loss: 0.525894
Train Epoch: 7 [006/030], Loss: 0.515665
Train Epoch: 7 [007/030], Loss: 0.524350
Train Epoch: 7 [008/030], Loss: 0.523155
Train Epoch: 7 [009/030], Loss: 0.514502
Train Epoch: 7 [010/030], Loss: 0.520955
Train Epoch: 7 [011/030], Loss: 0.515079
Train Epoch: 7 [012/030], Loss: 0.517062
Train Epoch: 7 [013/030], Loss: 0.502428
Train Epoch: 7 [014/030], Loss: 0.505979
Train Epoch: 7 [015/030], Loss: 0.507256
Train Epoch: 7 [016/030], Loss: 0.500486
Train Epoch: 7 [017/030], Loss: 0.511843
Train Epoch: 7 [018/030], Loss: 0.504095
Train Epoch: 7 [019/030], Loss: 0.490187
Train Epoch: 7 [020/030], Loss: 0.489616
Train Epoch: 7 [021/030], Loss: 0.493186
Train Epoch: 7 [022/030], Loss: 0.478395
Train Epoch: 7 [023/030], Loss: 0.4

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9918/10000 (99.180%)
Train Epoch: 8 [001/030], Loss: 0.467670
Train Epoch: 8 [002/030], Loss: 0.455673
Train Epoch: 8 [003/030], Loss: 0.458428
Train Epoch: 8 [004/030], Loss: 0.459142
Train Epoch: 8 [005/030], Loss: 0.457363
Train Epoch: 8 [006/030], Loss: 0.451025
Train Epoch: 8 [007/030], Loss: 0.452753
Train Epoch: 8 [008/030], Loss: 0.464538
Train Epoch: 8 [009/030], Loss: 0.445390
Train Epoch: 8 [010/030], Loss: 0.446881
Train Epoch: 8 [011/030], Loss: 0.447128
Train Epoch: 8 [012/030], Loss: 0.443721
Train Epoch: 8 [013/030], Loss: 0.441874
Train Epoch: 8 [014/030], Loss: 0.445817
Train Epoch: 8 [015/030], Loss: 0.441609
Train Epoch: 8 [016/030], Loss: 0.443518
Train Epoch: 8 [017/030], Loss: 0.435089
Train Epoch: 8 [018/030], Loss: 0.451810
Train Epoch: 8 [019/030], Loss: 0.430029
Train Epoch: 8 [020/030], Loss: 0.434117
Train Epoch: 8 [021/030], Loss: 0.426290
Train Epoch: 8 [022/030], Loss: 0.423751
Train Epoch: 8 [023/030], Loss: 0.4

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9912/10000 (99.120%)
Train Epoch: 9 [001/030], Loss: 0.418398
Train Epoch: 9 [002/030], Loss: 0.408011
Train Epoch: 9 [003/030], Loss: 0.415424
Train Epoch: 9 [004/030], Loss: 0.402562
Train Epoch: 9 [005/030], Loss: 0.406093
Train Epoch: 9 [006/030], Loss: 0.403615
Train Epoch: 9 [007/030], Loss: 0.402306
Train Epoch: 9 [008/030], Loss: 0.413138
Train Epoch: 9 [009/030], Loss: 0.404397
Train Epoch: 9 [010/030], Loss: 0.398019
Train Epoch: 9 [011/030], Loss: 0.407999
Train Epoch: 9 [012/030], Loss: 0.397591
Train Epoch: 9 [013/030], Loss: 0.396837
Train Epoch: 9 [014/030], Loss: 0.402960
Train Epoch: 9 [015/030], Loss: 0.402641
Train Epoch: 9 [016/030], Loss: 0.394820
Train Epoch: 9 [017/030], Loss: 0.404263
Train Epoch: 9 [018/030], Loss: 0.389264
Train Epoch: 9 [019/030], Loss: 0.386715
Train Epoch: 9 [020/030], Loss: 0.396539
Train Epoch: 9 [021/030], Loss: 0.388339
Train Epoch: 9 [022/030], Loss: 0.405376
Train Epoch: 9 [023/030], Loss: 0.3

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9917/10000 (99.170%)
Train Epoch: 10 [001/030], Loss: 0.381225
Train Epoch: 10 [002/030], Loss: 0.372451
Train Epoch: 10 [003/030], Loss: 0.374265
Train Epoch: 10 [004/030], Loss: 0.374987
Train Epoch: 10 [005/030], Loss: 0.372894
Train Epoch: 10 [006/030], Loss: 0.368853
Train Epoch: 10 [007/030], Loss: 0.369897
Train Epoch: 10 [008/030], Loss: 0.374589
Train Epoch: 10 [009/030], Loss: 0.370186
Train Epoch: 10 [010/030], Loss: 0.367384
Train Epoch: 10 [011/030], Loss: 0.367410
Train Epoch: 10 [012/030], Loss: 0.370448
Train Epoch: 10 [013/030], Loss: 0.371703
Train Epoch: 10 [014/030], Loss: 0.371174
Train Epoch: 10 [015/030], Loss: 0.362868
Train Epoch: 10 [016/030], Loss: 0.360119
Train Epoch: 10 [017/030], Loss: 0.363455
Train Epoch: 10 [018/030], Loss: 0.363073
Train Epoch: 10 [019/030], Loss: 0.359111
Train Epoch: 10 [020/030], Loss: 0.363191
Train Epoch: 10 [021/030], Loss: 0.364174
Train Epoch: 10 [022/030], Loss: 0.367098
Train Epoch: 

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9923/10000 (99.230%)
