In [1]:
# Import our GPyTorch library
import gpytorch

# Import some classes we will use from torch
from torch.autograd import Variable
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader

In [2]:
# Import datasets to access MNISTS and transforms to format data for learning
from torchvision import transforms, datasets

# Download (if needed) and load the MNIST train and test splits.
# Both splits share one preprocessing pipeline built with Compose, which chains transformations:
#   1. ToTensor turns each image into a torch.FloatTensor of shape (channels x height x width) with values in
#      [0.0, 1.0], as opposed to an image laid out as (height x width x channels).
#   2. Normalize standardizes with the MNIST mean (0.1307) and standard deviation (0.3081), which were
#      calculated ahead of time (not here).
# Transformation documentation here: http://pytorch.org/docs/master/torchvision/transforms.html
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
train_dataset = datasets.MNIST('/tmp', train=True, download=True, transform=mnist_transform)
test_dataset = datasets.MNIST('/tmp', train=False, download=True, transform=mnist_transform)

# Put the data into DataLoaders. The training data is shuffled because the order in which it is presented
# affects the outcome of training; the test data is not, because evaluation does not depend on order.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, pin_memory=True)

## Define the feature extractor for our deep kernel

In [3]:
# Import torch's neural network
# Documentation here: http://pytorch.org/docs/master/nn.html
from torch import nn
# Import torch.nn.functional for various activation/pooling functions
# Documentation here: http://pytorch.org/docs/master/nn.html#torch-nn-functional
from torch.nn import functional as F

# We make a classic LeNet Architecture sans a final prediction layer to 10 outputs. This will serve as a feature
# extractor reducing the dimensionality of our data down to 64. We will pretrain these layers by adding on a 
# final classifying 64-->10 layer
# https://medium.com/@siddharthdas_32104/cnns-architectures-lenet-alexnet-vgg-googlenet-resnet-and-more-666091488df5
class LeNetFeatureExtractor(nn.Module):
    """LeNet-style convolutional feature extractor (no final classification layer).

    Two conv -> batch-norm -> ReLU -> 2x2 max-pool stages are followed by a fully
    connected layer with batch-norm and ReLU. The 5x5 convolutions use padding=2, so
    only the two poolings shrink the image; the hard-coded 32 * 7 * 7 flatten size
    therefore corresponds to single-channel 28x28 inputs.

    Args:
        n_features: length of the feature vector produced for each input image.
            Defaults to 64, the value used throughout this notebook.
    """

    def __init__(self, n_features=64):
        super(LeNetFeatureExtractor, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2)
        self.norm1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, padding=2)
        self.norm2 = nn.BatchNorm2d(32)
        # 32 channels of 7x7 activations remain after the second pooling stage
        self.fc3 = nn.Linear(32 * 7 * 7, n_features)
        self.norm3 = nn.BatchNorm1d(n_features)

    def forward(self, x):
        """Map a batch of images to a batch of non-negative feature vectors.

        Args:
            x: input batch accepted by ``conv1`` (one input channel).

        Returns:
            Tensor with ``n_features`` columns; values are >= 0 because of the final ReLU.
        """
        x = F.max_pool2d(F.relu(self.norm1(self.conv1(x))), 2)
        x = F.max_pool2d(F.relu(self.norm2(self.conv2(x))), 2)
        # Flatten to (batch, 32 * 7 * 7); the size is read back from fc3 so it is defined in one place
        x = x.view(-1, self.fc3.in_features)
        x = F.relu(self.norm3(self.fc3(x)))
        return x
    
# Instantiate the feature extractor and move it to the GPU (requires a CUDA device)
feature_extractor = LeNetFeatureExtractor().cuda()

### Pretrain the feature extractor a bit

In [4]:
# Linear read-out layer (64 features --> 10 class scores) stacked on the feature extractor's output
classifier = nn.Linear(64, 10).cuda()
# Gather every trainable parameter: the feature extractor's first, then the classifier's
params = [param for module in (feature_extractor, classifier) for param in module.parameters()]
# Pretraining uses stochastic gradient descent with momentum
optimizer = SGD(params, lr=0.1, momentum=0.9)

# Define our pretraining function
#    Set feature extractor to train mode (need b/c module unlike classifier which is just a single layer)
#    iterate through train_loader
#    put the data on the GPU as a variable
#    Zero out the gradients before backprop (needed b/c PyTorch accumulates gradients across backward() calls by default)
#    Extract the 64-dimensional feature vector
#    Feed the features into the classifying layer and output the log softmax
#    Calculate negative log likelihood loss
#    (torch.nn.functional.cross_entropy could replace the two steps above: it combines log_softmax and nll_loss)
#    Backprop
#    Incrementally optimize parameters
#    Accumulate training loss
#    Print result of epoch
def pretrain(epoch):
    """Run one pretraining epoch of the feature extractor plus linear classifier.

    Iterates over the module-level ``train_loader``, takes one ``optimizer`` step per
    mini-batch on the negative log likelihood of the classifier's log-softmax output,
    and prints the epoch's mean per-sample training loss.

    Args:
        epoch: epoch number; used only in the printed progress line.
    """
    feature_extractor.train()
    running_loss = 0.
    for images, labels in train_loader:
        # Move the mini-batch to the GPU and wrap it for autograd
        images = Variable(images.cuda())
        labels = Variable(labels.cuda())
        optimizer.zero_grad()
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()
        # batch_loss is a per-batch mean, so weight it by the batch size before accumulating
        running_loss += batch_loss.data[0] * len(images)
    print('Train Epoch: %d\tLoss: %.6f' % (epoch, running_loss / len(train_dataset)))

# Set feature extractor to eval mode (this only affects layers such as Dropout and BatchNorm; our network uses BatchNorm, so it matters here)
# http://pytorch.org/docs/master/nn.html#torch.nn.Module.train
# Set test_loss accumulator and correct counter
# Iterate through test data
#    volatile=True tells autograd not to record the computation graph for this input, since no gradients
#          are needed at test time; this saves memory and computation
#    calculate loss and accumulate
#    make prediction and check accuracy
def pretest():
    """Evaluate the feature extractor plus linear classifier on the test set.

    Iterates over the module-level ``test_loader`` with the feature extractor in eval
    mode, then prints the mean per-sample negative log likelihood and the accuracy.
    """
    feature_extractor.eval()
    total_loss = 0
    n_correct = 0
    for images, labels in test_loader:
        # volatile=True: no gradients are needed during evaluation
        images = Variable(images.cuda(), volatile=True)
        labels = Variable(labels.cuda())
        log_probs = F.log_softmax(classifier(feature_extractor(images)), 1)
        # size_average=False sums the per-sample losses instead of averaging them
        total_loss += F.nll_loss(log_probs, labels, size_average=False).data[0]
        # The predicted class is the index of the largest log-probability
        predicted = log_probs.data.max(1, keepdim=True)[1]
        n_correct += predicted.eq(labels.data.view_as(predicted)).cpu().sum()
    n_test = len(test_loader.dataset)
    total_loss /= n_test
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)'.format(
        total_loss, n_correct, n_test, 100. * n_correct / n_test))

# Pretrain for a few epochs, reporting test-set loss and accuracy after each one
n_epochs = 3
for epoch in range(1, n_epochs + 1):
    pretrain(epoch)
    pretest()

Train Epoch: 1	Loss: 0.149688
Test set: Average loss: 0.0541, Accuracy: 9827/10000 (98.270%)
Train Epoch: 2	Loss: 0.039770
Test set: Average loss: 0.0342, Accuracy: 9883/10000 (98.830%)
Train Epoch: 3	Loss: 0.026879
Test set: Average loss: 0.0361, Accuracy: 9879/10000 (98.790%)


## Define the deep kernel GP

In [5]:
# now this is our first exposure to the usefulness of gpytorch

# A gpytorch Module is a subclass of torch.nn.Module
class DKLModel(gpytorch.Module):
    """Deep kernel model: a feature-extracting network followed by GP latent functions.

    Args:
        feature_extractor: network applied to the raw input to produce features.
        n_features: dimension of the extracted feature vector (64 by default).
        grid_bounds: (lower, upper) range the features are scaled into before being
            passed to the latent functions.
    """

    def __init__(self, feature_extractor, n_features=64, grid_bounds=(-10., 10.)):
        super(DKLModel, self).__init__()
        # Plain configuration values
        self.n_features = n_features
        self.grid_bounds = grid_bounds
        # Sub-modules: the network first, then the latent functions that turn features into outputs
        self.feature_extractor = feature_extractor
        self.latent_functions = LatentFunctions(n_features=n_features, grid_bounds=grid_bounds)

    def forward(self, x):
        """Extract features from ``x``, scale them to the grid bounds, and apply the latent functions."""
        lower = self.grid_bounds[0]
        upper = self.grid_bounds[1]
        extracted = self.feature_extractor(x)
        # Scale to fit inside grid bounds
        scaled = gpytorch.utils.scale_to_bounds(extracted, lower, upper)
        # The latent functions receive the features with an extra trailing dimension
        return self.latent_functions(scaled.unsqueeze(-1))
    
# The AdditiveGridInducingVariationalGP trains multiple GPs on the features.
# These are mixed together by the likelihood function to generate the final
# classification output.

# Grid bounds specify the allowed values of features
# grid_size is the number of subdivisions along each dimension
class LatentFunctions(gpytorch.models.AdditiveGridInducingVariationalGP):
    """Zero-mean, RBF-kernel GP latent functions with one component per extracted feature.

    Args:
        n_features: number of features coming from the feature extractor; passed to the
            base class as ``n_components``.
        grid_bounds: (lower, upper) range of allowed feature values.
        grid_size: number of grid subdivisions passed to the base class.
    """

    def __init__(self, n_features=64, grid_bounds=(-10., 10.), grid_size=128):
        # NOTE(review): mixing_params=False and sum_output=False presumably leave the per-feature GP
        # outputs unmixed and unsummed here so the likelihood can combine them -- confirm against gpytorch
        super(LatentFunctions, self).__init__(
            grid_size=grid_size,
            grid_bounds=[grid_bounds],
            n_components=n_features,
            mixing_params=False,
            sum_output=False,
        )
        # RBF kernel with its log-lengthscale initialized to 0
        rbf_kernel = gpytorch.kernels.RBFKernel()
        rbf_kernel.initialize(log_lengthscale=0)
        self.cov_module = rbf_kernel
        self.grid_bounds = grid_bounds

    def forward(self, x):
        """Return the GP prior at ``x`` as a Gaussian with zero mean and RBF covariance."""
        # Zero mean with one entry per row of x, created with the same tensor type as x
        zero_mean = Variable(x.data.new(len(x)).zero_())
        rbf_covar = self.cov_module(x)
        return gpytorch.random_variables.GaussianRandomVariable(zero_mean, rbf_covar)
    
# Initialize the model around the pretrained feature extractor and move it to the GPU
model = DKLModel(feature_extractor).cuda()
# Choose the likelihood function to use
# Here we use the softmax likelihood (e^z_i)/SUM_over_j(e^z_j), configured for 10 classes
# https://en.wikipedia.org/wiki/Softmax_function
likelihood = gpytorch.likelihoods.SoftmaxLikelihood(n_features=model.n_features, n_classes=10).cuda()

In [6]:
# Rebuild the training DataLoader with a larger batch size (2048) for the GP training phase
# NOTE: this rebinds the module-level `train_loader` that pretrain() also reads
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True, pin_memory=True)
# We use an Adam optimizer over both the model and likelihood parameters
# https://arxiv.org/abs/1412.6980
# NOTE: this rebinds the module-level `optimizer` that pretrain() also reads
optimizer = Adam(list(model.parameters()) + list(likelihood.parameters()), lr=0.01)

def train(epoch):
    """Run one training epoch of the deep kernel model and its likelihood.

    Iterates over the module-level ``train_loader`` and, for every mini-batch, takes one
    ``optimizer`` step on the negated ``marginal_log_likelihood`` of the latent functions,
    then prints that batch's loss.

    Args:
        epoch: epoch number; used only in the printed progress lines.
    """
    model.train()
    likelihood.train()

    for batch_idx, (data, target) in enumerate(train_loader):
        # Move the mini-batch to the GPU and wrap it for autograd
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        # The loss is the negated marginal log likelihood; n_data is the full training-set size
        loss = -model.latent_functions.marginal_log_likelihood(likelihood, output, target, n_data=len(train_dataset))
        loss.backward()
        optimizer.step()
        print('Train Epoch: %d [%03d/%03d], Loss: %.6f' % (epoch, batch_idx + 1, len(train_loader), loss.data[0]))

def test():
    """Print the classification accuracy of the deep kernel model on the test set.

    Iterates over the module-level ``test_loader`` with the model and likelihood in eval
    mode, takes ``argmax()`` of the likelihood's output as the prediction, and counts matches.

    No loss is printed: this loop never computes one. The previous version divided and
    printed an accumulator that was never updated, so it always showed
    "Average loss: 0.0000".
    """
    model.eval()
    likelihood.eval()

    correct = 0
    for data, target in test_loader:
        data, target = data.cuda(), target.cuda()
        # volatile=True: no gradients are needed during evaluation
        data, target = Variable(data, volatile=True), Variable(target)
        output = likelihood(model(data))
        pred = output.argmax()
        correct += pred.eq(target.view_as(pred)).data.cpu().sum()
    n_test = len(test_loader.dataset)
    print('Test set: Accuracy: {}/{} ({:.3f}%)'.format(correct, n_test, 100. * correct / n_test))

n_epochs = 10

# While we have theoretically fast algorithms for Toeplitz matrix-vector multiplication, the hardware of GPUs
# is so well-designed that naive multiplication on them beats the current implementation of our algorithm (despite
# theoretically fast computation). Because of this, we set the use_toeplitz flag to False to minimize runtime
with gpytorch.settings.use_toeplitz(False):
    for epoch in range(1, n_epochs + 1):
        # %time is an IPython line magic that reports how long the train(epoch) call takes
        %time train(epoch)
        test()

Train Epoch: 1 [001/030], Loss: 73.562729
Train Epoch: 1 [002/030], Loss: 82.663742
Train Epoch: 1 [003/030], Loss: 103.586403
Train Epoch: 1 [004/030], Loss: 94.326477
Train Epoch: 1 [005/030], Loss: 83.820274
Train Epoch: 1 [006/030], Loss: 51.902367
Train Epoch: 1 [007/030], Loss: 45.762814
Train Epoch: 1 [008/030], Loss: 120.406364
Train Epoch: 1 [009/030], Loss: 37.549706
Train Epoch: 1 [010/030], Loss: 42.237991
Train Epoch: 1 [011/030], Loss: 37.918449
Train Epoch: 1 [012/030], Loss: 31.445269
Train Epoch: 1 [013/030], Loss: 40.649334
Train Epoch: 1 [014/030], Loss: 27.964289
Train Epoch: 1 [015/030], Loss: 31.886467
Train Epoch: 1 [016/030], Loss: 20.399744
Train Epoch: 1 [017/030], Loss: 17.548803
Train Epoch: 1 [018/030], Loss: 24.627878
Train Epoch: 1 [019/030], Loss: 17.850117
Train Epoch: 1 [020/030], Loss: 25.801538
Train Epoch: 1 [021/030], Loss: 14.341015
Train Epoch: 1 [022/030], Loss: 11.081783
Train Epoch: 1 [023/030], Loss: 10.896550
Train Epoch: 1 [024/030], Loss: 

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 7345/10000 (73.450%)
Train Epoch: 2 [001/030], Loss: 7.326035
Train Epoch: 2 [002/030], Loss: 6.381141
Train Epoch: 2 [003/030], Loss: 18.014357
Train Epoch: 2 [004/030], Loss: 5.063754
Train Epoch: 2 [005/030], Loss: 3.934478
Train Epoch: 2 [006/030], Loss: 4.926157
Train Epoch: 2 [007/030], Loss: 4.167954
Train Epoch: 2 [008/030], Loss: 5.812820
Train Epoch: 2 [009/030], Loss: 3.497722
Train Epoch: 2 [010/030], Loss: 4.322648
Train Epoch: 2 [011/030], Loss: 4.075482
Train Epoch: 2 [012/030], Loss: 3.587610
Train Epoch: 2 [013/030], Loss: 3.207456
Train Epoch: 2 [014/030], Loss: 3.078331
Train Epoch: 2 [015/030], Loss: 4.152343
Train Epoch: 2 [016/030], Loss: 2.611198
Train Epoch: 2 [017/030], Loss: 3.961612
Train Epoch: 2 [018/030], Loss: 4.185015
Train Epoch: 2 [019/030], Loss: 7.462733
Train Epoch: 2 [020/030], Loss: 2.396472
Train Epoch: 2 [021/030], Loss: 3.037478
Train Epoch: 2 [022/030], Loss: 2.183641
Train Epoch: 2 [023/030], Loss: 1.

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9562/10000 (95.620%)
Train Epoch: 3 [001/030], Loss: 1.349056
Train Epoch: 3 [002/030], Loss: 1.561499
Train Epoch: 3 [003/030], Loss: 1.211915
Train Epoch: 3 [004/030], Loss: 1.286141
Train Epoch: 3 [005/030], Loss: 1.183094
Train Epoch: 3 [006/030], Loss: 1.949000
Train Epoch: 3 [007/030], Loss: 1.319652
Train Epoch: 3 [008/030], Loss: 1.486577
Train Epoch: 3 [009/030], Loss: 3.139071
Train Epoch: 3 [010/030], Loss: 1.188096
Train Epoch: 3 [011/030], Loss: 2.681830
Train Epoch: 3 [012/030], Loss: 0.863339
Train Epoch: 3 [013/030], Loss: 0.820401
Train Epoch: 3 [014/030], Loss: 1.016593
Train Epoch: 3 [015/030], Loss: 0.746513
Train Epoch: 3 [016/030], Loss: 2.122018
Train Epoch: 3 [017/030], Loss: 0.788027
Train Epoch: 3 [018/030], Loss: 1.953751
Train Epoch: 3 [019/030], Loss: 0.791373
Train Epoch: 3 [020/030], Loss: 0.695260
Train Epoch: 3 [021/030], Loss: 0.721512
Train Epoch: 3 [022/030], Loss: 0.705809
Train Epoch: 3 [023/030], Loss: 0.6

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9770/10000 (97.700%)
Train Epoch: 4 [001/030], Loss: 0.536258
Train Epoch: 4 [002/030], Loss: 0.472667
Train Epoch: 4 [003/030], Loss: 0.508217
Train Epoch: 4 [004/030], Loss: 0.536780
Train Epoch: 4 [005/030], Loss: 0.437637
Train Epoch: 4 [006/030], Loss: 0.430580
Train Epoch: 4 [007/030], Loss: 0.445569
Train Epoch: 4 [008/030], Loss: 0.404310
Train Epoch: 4 [009/030], Loss: 0.479061
Train Epoch: 4 [010/030], Loss: 0.540917
Train Epoch: 4 [011/030], Loss: 0.416310
Train Epoch: 4 [012/030], Loss: 0.404040
Train Epoch: 4 [013/030], Loss: 0.374272
Train Epoch: 4 [014/030], Loss: 0.398496
Train Epoch: 4 [015/030], Loss: 0.385793
Train Epoch: 4 [016/030], Loss: 0.656962
Train Epoch: 4 [017/030], Loss: 0.567622
Train Epoch: 4 [018/030], Loss: 0.400512
Train Epoch: 4 [019/030], Loss: 0.356935
Train Epoch: 4 [020/030], Loss: 0.687547
Train Epoch: 4 [021/030], Loss: 0.330172
Train Epoch: 4 [022/030], Loss: 0.332320
Train Epoch: 4 [023/030], Loss: 0.3

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9869/10000 (98.690%)
Train Epoch: 5 [001/030], Loss: 0.296992
Train Epoch: 5 [002/030], Loss: 0.328858
Train Epoch: 5 [003/030], Loss: 0.266479
Train Epoch: 5 [004/030], Loss: 0.297663
Train Epoch: 5 [005/030], Loss: 0.332768
Train Epoch: 5 [006/030], Loss: 0.247213
Train Epoch: 5 [007/030], Loss: 0.217652
Train Epoch: 5 [008/030], Loss: 0.325998
Train Epoch: 5 [009/030], Loss: 0.273159
Train Epoch: 5 [010/030], Loss: 0.255474
Train Epoch: 5 [011/030], Loss: 0.314953
Train Epoch: 5 [012/030], Loss: 0.257866
Train Epoch: 5 [013/030], Loss: 0.212163
Train Epoch: 5 [014/030], Loss: 0.225794
Train Epoch: 5 [015/030], Loss: 0.247043
Train Epoch: 5 [016/030], Loss: 0.221152
Train Epoch: 5 [017/030], Loss: 0.259933
Train Epoch: 5 [018/030], Loss: 0.221741
Train Epoch: 5 [019/030], Loss: 0.239763
Train Epoch: 5 [020/030], Loss: 0.188205
Train Epoch: 5 [021/030], Loss: 0.214881
Train Epoch: 5 [022/030], Loss: 0.250654
Train Epoch: 5 [023/030], Loss: 0.2

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9876/10000 (98.760%)
Train Epoch: 6 [001/030], Loss: 0.208963
Train Epoch: 6 [002/030], Loss: 0.200370
Train Epoch: 6 [003/030], Loss: 0.177837
Train Epoch: 6 [004/030], Loss: 0.212964
Train Epoch: 6 [005/030], Loss: 0.204766
Train Epoch: 6 [006/030], Loss: 0.167096
Train Epoch: 6 [007/030], Loss: 0.265125
Train Epoch: 6 [008/030], Loss: 0.196995
Train Epoch: 6 [009/030], Loss: 0.207403
Train Epoch: 6 [010/030], Loss: 0.141273
Train Epoch: 6 [011/030], Loss: 0.154239
Train Epoch: 6 [012/030], Loss: 0.170697
Train Epoch: 6 [013/030], Loss: 0.153261
Train Epoch: 6 [014/030], Loss: 0.189697
Train Epoch: 6 [015/030], Loss: 0.139502
Train Epoch: 6 [016/030], Loss: 0.136670
Train Epoch: 6 [017/030], Loss: 0.113855
Train Epoch: 6 [018/030], Loss: 0.184707
Train Epoch: 6 [019/030], Loss: 0.198732
Train Epoch: 6 [020/030], Loss: 0.162072
Train Epoch: 6 [021/030], Loss: 0.129305
Train Epoch: 6 [022/030], Loss: 0.305793
Train Epoch: 6 [023/030], Loss: 0.1

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9903/10000 (99.030%)
Train Epoch: 7 [001/030], Loss: 0.112920
Train Epoch: 7 [002/030], Loss: 0.130736
Train Epoch: 7 [003/030], Loss: 0.144949
Train Epoch: 7 [004/030], Loss: 0.148146
Train Epoch: 7 [005/030], Loss: 0.125655
Train Epoch: 7 [006/030], Loss: 0.113538
Train Epoch: 7 [007/030], Loss: 0.329770
Train Epoch: 7 [008/030], Loss: 0.126772
Train Epoch: 7 [009/030], Loss: 0.106565
Train Epoch: 7 [010/030], Loss: 0.126519
Train Epoch: 7 [011/030], Loss: 0.109528
Train Epoch: 7 [012/030], Loss: 0.101979
Train Epoch: 7 [013/030], Loss: 0.142129
Train Epoch: 7 [014/030], Loss: 0.104333
Train Epoch: 7 [015/030], Loss: 0.128468
Train Epoch: 7 [016/030], Loss: 0.227582
Train Epoch: 7 [017/030], Loss: 0.120418
Train Epoch: 7 [018/030], Loss: 0.544133
Train Epoch: 7 [019/030], Loss: 0.138761
Train Epoch: 7 [020/030], Loss: 0.065223
Train Epoch: 7 [021/030], Loss: 0.084803
Train Epoch: 7 [022/030], Loss: 0.096719
Train Epoch: 7 [023/030], Loss: 0.0

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9911/10000 (99.110%)
Train Epoch: 8 [001/030], Loss: 0.100763
Train Epoch: 8 [002/030], Loss: 0.063610
Train Epoch: 8 [003/030], Loss: 0.091145
Train Epoch: 8 [004/030], Loss: 0.097637
Train Epoch: 8 [005/030], Loss: 0.124313
Train Epoch: 8 [006/030], Loss: 0.076175
Train Epoch: 8 [007/030], Loss: 0.057131
Train Epoch: 8 [008/030], Loss: 0.074120
Train Epoch: 8 [009/030], Loss: 0.086776
Train Epoch: 8 [010/030], Loss: 0.057881
Train Epoch: 8 [011/030], Loss: 0.062754
Train Epoch: 8 [012/030], Loss: 0.089559
Train Epoch: 8 [013/030], Loss: 0.068019
Train Epoch: 8 [014/030], Loss: 0.045410
Train Epoch: 8 [015/030], Loss: 0.065038
Train Epoch: 8 [016/030], Loss: 0.173387
Train Epoch: 8 [017/030], Loss: 0.063684
Train Epoch: 8 [018/030], Loss: 0.105259
Train Epoch: 8 [019/030], Loss: 0.029468
Train Epoch: 8 [020/030], Loss: 0.109944
Train Epoch: 8 [021/030], Loss: 0.097185
Train Epoch: 8 [022/030], Loss: 0.083691
Train Epoch: 8 [023/030], Loss: 0.0

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9907/10000 (99.070%)
Train Epoch: 9 [001/030], Loss: 0.067182
Train Epoch: 9 [002/030], Loss: 0.070725
Train Epoch: 9 [003/030], Loss: 0.051012
Train Epoch: 9 [004/030], Loss: 0.051039
Train Epoch: 9 [005/030], Loss: 0.041201
Train Epoch: 9 [006/030], Loss: 0.056210
Train Epoch: 9 [007/030], Loss: 0.065750
Train Epoch: 9 [008/030], Loss: 0.040785
Train Epoch: 9 [009/030], Loss: 0.044801
Train Epoch: 9 [010/030], Loss: 0.014538
Train Epoch: 9 [011/030], Loss: 0.044146
Train Epoch: 9 [012/030], Loss: 0.042313
Train Epoch: 9 [013/030], Loss: 0.067476
Train Epoch: 9 [014/030], Loss: 0.030909
Train Epoch: 9 [015/030], Loss: 0.037886
Train Epoch: 9 [016/030], Loss: 0.060732
Train Epoch: 9 [017/030], Loss: 0.039009
Train Epoch: 9 [018/030], Loss: 0.024978
Train Epoch: 9 [019/030], Loss: 0.125011
Train Epoch: 9 [020/030], Loss: 0.048340
Train Epoch: 9 [021/030], Loss: 0.058975
Train Epoch: 9 [022/030], Loss: 0.065850
Train Epoch: 9 [023/030], Loss: -0.

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9907/10000 (99.070%)
Train Epoch: 10 [001/030], Loss: 0.044148
Train Epoch: 10 [002/030], Loss: 0.032647
Train Epoch: 10 [003/030], Loss: 0.084709
Train Epoch: 10 [004/030], Loss: 0.034030
Train Epoch: 10 [005/030], Loss: 0.037789
Train Epoch: 10 [006/030], Loss: 0.018861
Train Epoch: 10 [007/030], Loss: 0.041112
Train Epoch: 10 [008/030], Loss: 0.029310
Train Epoch: 10 [009/030], Loss: 0.056010
Train Epoch: 10 [010/030], Loss: 0.009141
Train Epoch: 10 [011/030], Loss: 0.021309
Train Epoch: 10 [012/030], Loss: 0.031721
Train Epoch: 10 [013/030], Loss: 0.032702
Train Epoch: 10 [014/030], Loss: 0.010268
Train Epoch: 10 [015/030], Loss: 0.019758
Train Epoch: 10 [016/030], Loss: 0.042157
Train Epoch: 10 [017/030], Loss: 0.031368
Train Epoch: 10 [018/030], Loss: 0.032619
Train Epoch: 10 [019/030], Loss: -0.005643
Train Epoch: 10 [020/030], Loss: 0.040154
Train Epoch: 10 [021/030], Loss: 0.014939
Train Epoch: 10 [022/030], Loss: 0.003576
Train Epoch:

  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Test set: Average loss: 0.0000, Accuracy: 9905/10000 (99.050%)
