In [1]:
import torch
import numpy
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import os

from model.net import *
from utils.training import *
from data.data import *

## Experiment Config

In [2]:
# Directory the checkpoint file is looked up in: wherever the notebook was launched.
model_path = os.getcwd()

# Experiment settings handed to the model constructor, the dataset helper and the training loop.
args = {
    'USE_CUDA': torch.cuda.is_available(),  # use the GPU whenever PyTorch can see one
    'BATCH_SIZE': 256,
    'N_EPOCHS': 30,
    'LEARNING_RATE': 1e-2,
    'MOMENTUM': 0.9,
    'DATASET_NAME': 'mnist',
    'LAMBDA_recon': 0.5,  # alternative value noted by the author: 0.0005
    'LAMBDA_margin': 0,
    'LReLU_negative_slope': 0.1,
    'WEIGHTDECAY': 0,  # alternative value noted by the author: 5e-3
}

## Model Loading

In [3]:
class Config:
    """Layer hyper-parameters for the CapsNet built in this notebook.

    This layout yields 49 16-d vectors in the primary-capsule layer.
    Author's note: set the softmax dimension to 0 when using this layout.
    """

    def __init__(self):
        conv_channels = 12      # conv output channels == primary-capsule input channels
        capsule_dim = 16        # 16-d capsule vectors
        n_routes = 1 * 7 * 7    # 49 routes; presumably a 7x7 grid with one channel -- confirm against model.net
        image_side = 28         # decoder input is image_side x image_side

        # Convolutional front end (cnn)
        self.cnn_in_channels = 1
        self.cnn_out_channels = conv_channels
        self.cnn_kernel_size = 15

        # Primary capsule layer (pc)
        self.pc_num_capsules = capsule_dim
        self.pc_in_channels = conv_channels
        self.pc_out_channels = 1
        self.pc_kernel_size = 8
        self.pc_num_routes = n_routes

        # First digit-capsule layer (dc)
        self.dc_num_capsules = 49
        self.dc_num_routes = n_routes
        self.dc_in_channels = capsule_dim
        self.dc_out_channels = capsule_dim

        # Second digit-capsule layer (dc_2): ten output capsules
        self.dc_2_num_capsules = 10
        self.dc_2_num_routes = n_routes
        self.dc_2_in_channels = capsule_dim
        self.dc_2_out_channels = capsule_dim

        # Decoder
        self.input_width = image_side
        self.input_height = image_side
# Seed PyTorch's RNG before the network is constructed.
torch.manual_seed(1)
config = Config()
net = CapsNet(args, config)
if args['USE_CUDA']:
    net = net.cuda()

# Freeze every layer except the decoder: switch off gradient tracking on its
# parameters and put the layer in eval mode.
to_freeze = [
    net.conv_layer,
    net.primary_capsules,
    net.digit_capsules_1,
    net.digit_capsules_2,
]
for layer in to_freeze:
    for param in layer.parameters():
        param.requires_grad_(False)
    layer.eval()

# Restore the saved weights. Tensors are mapped to CPU while the file is read;
# load_state_dict then copies them into the model's existing parameters.
# Kept as the last expression so the cell displays the key-matching report.
net.load_state_dict(torch.load(os.path.join(model_path, 'CapsNetMNIST.pth'), map_location='cpu'))

<All keys matched successfully>

## Loading Dataset

In [4]:
# Build the train and test loaders from the experiment settings.
# `dataset` arrives through one of the star imports above (presumably data.data -- confirm).
trainloader, testloader = dataset(args)

## Training the CapsuleNet Decoder (all other layers frozen)

In [5]:
# Adam is given only the parameters that still require gradients, so frozen
# layers are never registered with the optimizer.
optimizer = torch.optim.Adam(
    (p for p in net.parameters() if p.requires_grad),
    lr=args['LEARNING_RATE'],
    weight_decay=args['WEIGHTDECAY'],
)

def train_CapsNet(model, optimizer, train_loader, epoch, args):
    """Run one training epoch and report progress through tqdm.

    Args:
        model: network that, when called on a batch, returns
            ``(output, reconstructions, masked)`` and that exposes
            ``loss(data, output, target, reconstructions)`` plus
            ``decoder.reconstraction_layers`` (used for the diagnostics below).
        optimizer: optimizer stepped once per batch.
        train_loader: iterable of ``(data, target)`` batches with ``__len__`` and
            a ``dataset`` attribute; ``target`` holds integer class indices.
        epoch: epoch number, used only in the log messages.
        args: settings dict; ``USE_CUDA`` and ``N_EPOCHS`` are read here.

    Returns:
        None. Progress and diagnostics are written with ``tqdm.write``.
    """
    capsule_net = model
    capsule_net.train()
    # train() switches *every* submodule to training mode, which silently undoes
    # the .eval() applied to layers that were frozen on purpose. Put children
    # whose parameters are all frozen back into eval mode.
    for child in capsule_net.children():
        child_params = list(child.parameters())
        if child_params and not any(p.requires_grad for p in child_params):
            child.eval()

    num_classes = 10  # width of the one-hot label encoding
    log_every = 100   # emit diagnostics on every 100th batch, starting with the first
    n_batch = len(train_loader)
    total_loss = 0
    for batch_id, (data, target) in enumerate(tqdm(train_loader)):
        # One-hot encode the integer labels by selecting rows of the identity matrix.
        target = torch.eye(num_classes).index_select(dim=0, index=target)
        if args['USE_CUDA']:
            data, target = data.cuda(), target.cuda()
        # Actual number of samples in this batch; the final batch of an epoch
        # can be shorter than the configured batch size.
        batch_size = target.size(0)
        should_log = batch_id % log_every == 0

        optimizer.zero_grad()
        output, reconstructions, masked = capsule_net(data)
        loss = capsule_net.loss(data, output, target, reconstructions)

        # Diagnostics: watch a slice of the first decoder layer's weights and
        # gradients around the optimizer step to confirm the decoder is updating.
        # tqdm.write is used instead of print so the progress bar is not garbled.
        if should_log:
            probe = capsule_net.decoder.reconstraction_layers[0].weight
            tqdm.write(str(probe[0][:3]))
        loss.backward()
        if should_log:
            tqdm.write("the grads before are {}".format(probe.grad[0][:3].data))
        optimizer.step()
        if should_log:
            tqdm.write(str(probe[0][:3]))
            # optimizer.step() does not modify .grad, so this matches the line above.
            tqdm.write("the grads after are {}".format(probe.grad[0][:3].data))

        correct = torch.sum(torch.argmax(masked, 1) == torch.argmax(target, 1)).item()
        train_loss = loss.item()
        total_loss += train_loss
        if should_log:
            tqdm.write("Epoch: [{}/{}], Batch: [{}/{}], train accuracy: {:.6f}, loss: {:.6f}".format(
                epoch,
                args['N_EPOCHS'],
                batch_id + 1,
                n_batch,
                correct / float(batch_size),
                train_loss / float(batch_size)
                ))
    tqdm.write('Epoch: [{}/{}], train loss: {:.6f}'.format(epoch, args['N_EPOCHS'], total_loss / len(train_loader.dataset)))

# Epochs are numbered from 1 so the log lines read [1/N] .. [N/N].
for e in range(1, args['N_EPOCHS'] + 1):
    train_CapsNet(net, optimizer, trainloader, e, args)
    # Evaluate on the test loader after every training epoch.
    # `test_CapsNet` arrives through a star import (presumably utils.training -- confirm).
    test_CapsNet(net, testloader, e, args)

  1%|▏         | 3/235 [00:00<02:05,  1.84it/s]

tensor([-0.1905, -0.1794, -0.0265], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0002,  0.0003,  0.0002], device='cuda:0')
tensor([-0.1805, -0.1894, -0.0365], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0002,  0.0003,  0.0002], device='cuda:0')
Epoch: [1/30], Batch: [1/235], train accuracy: 0.992188, loss: 0.001790


 44%|████▍     | 103/235 [00:08<00:10, 13.19it/s]

tensor([-0.1836, -0.1958, -0.0241], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0007, -0.0009, -0.0008], device='cuda:0')
tensor([-0.1828, -0.1968, -0.0249], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0007, -0.0009, -0.0008], device='cuda:0')
Epoch: [1/30], Batch: [101/235], train accuracy: 0.976562, loss: 0.000983


 86%|████████▋ | 203/235 [00:15<00:02, 13.15it/s]

tensor([-0.1922, -0.2002, -0.0085], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-7.8829e-05,  9.3259e-05,  8.1180e-05], device='cuda:0')
tensor([-0.1918, -0.2007, -0.0089], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-7.8829e-05,  9.3259e-05,  8.1180e-05], device='cuda:0')
Epoch: [1/30], Batch: [201/235], train accuracy: 0.996094, loss: 0.000988


100%|██████████| 235/235 [00:18<00:00, 12.82it/s]

Epoch: [1/30], train loss: 0.001132



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [1/30], test accuracy: 0.985100, loss: 0.249385


  0%|          | 1/235 [00:00<00:43,  5.40it/s]

tensor([-0.1807, -0.2139, -0.0200], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0024,  0.0029,  0.0025], device='cuda:0')
tensor([-0.1798, -0.2149, -0.0208], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0024,  0.0029,  0.0025], device='cuda:0')
Epoch: [2/30], Batch: [1/235], train accuracy: 1.000000, loss: 0.001014


 44%|████▍     | 103/235 [00:07<00:10, 13.07it/s]

tensor([-0.1906, -0.2148, -0.0042], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0005,  0.0006,  0.0005], device='cuda:0')
tensor([-0.1905, -0.2150, -0.0042], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0005,  0.0006,  0.0005], device='cuda:0')
Epoch: [2/30], Batch: [101/235], train accuracy: 0.984375, loss: 0.000969


 86%|████████▋ | 203/235 [00:16<00:02, 11.75it/s]

tensor([-0.1807, -0.2305, -0.0100], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0010,  0.0012,  0.0010], device='cuda:0')
tensor([-0.1820, -0.2293, -0.0086], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0010,  0.0012,  0.0010], device='cuda:0')
Epoch: [2/30], Batch: [201/235], train accuracy: 0.996094, loss: 0.000950


100%|██████████| 235/235 [00:18<00:00, 12.50it/s]

Epoch: [2/30], train loss: 0.000961



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [2/30], test accuracy: 0.985100, loss: 0.247195


  0%|          | 1/235 [00:00<00:45,  5.13it/s]

tensor([-0.1890, -0.2233, -0.0005], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0010,  0.0013,  0.0011], device='cuda:0')
tensor([-0.1876, -0.2248, -0.0019], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0010,  0.0013,  0.0011], device='cuda:0')
Epoch: [3/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000966


 44%|████▍     | 103/235 [00:09<00:11, 11.40it/s]

tensor([-0.1833, -0.2338, -0.0042], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0002, -0.0003, -0.0003], device='cuda:0')
tensor([-0.1837, -0.2334, -0.0037], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0002, -0.0003, -0.0003], device='cuda:0')
Epoch: [3/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000974


 86%|████████▋ | 203/235 [00:17<00:02, 11.40it/s]

tensor([-0.1907, -0.2273,  0.0056], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0003,  0.0002,  0.0002], device='cuda:0')
tensor([-0.1903, -0.2277,  0.0053], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0003,  0.0002,  0.0002], device='cuda:0')
Epoch: [3/30], Batch: [201/235], train accuracy: 0.988281, loss: 0.000923


100%|██████████| 235/235 [00:20<00:00, 11.44it/s]

Epoch: [3/30], train loss: 0.000940



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [3/30], test accuracy: 0.985100, loss: 0.238543


  0%|          | 1/235 [00:00<00:44,  5.22it/s]

tensor([-0.1718, -0.2469, -0.0101], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0030, -0.0036, -0.0031], device='cuda:0')
tensor([-0.1733, -0.2455, -0.0086], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0030, -0.0036, -0.0031], device='cuda:0')
Epoch: [4/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000936


 44%|████▍     | 103/235 [00:09<00:11, 11.41it/s]

tensor([-0.1863, -0.2344,  0.0074], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0005, -0.0006, -0.0005], device='cuda:0')
tensor([-0.1866, -0.2341,  0.0077], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0005, -0.0006, -0.0005], device='cuda:0')
Epoch: [4/30], Batch: [101/235], train accuracy: 0.992188, loss: 0.000868


 86%|████████▋ | 203/235 [00:17<00:02, 11.40it/s]

tensor([-0.1843, -0.2368,  0.0100], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 9.0916e-05, -1.1202e-04, -1.2423e-04], device='cuda:0')
tensor([-0.1840, -0.2371,  0.0097], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 9.0916e-05, -1.1202e-04, -1.2423e-04], device='cuda:0')
Epoch: [4/30], Batch: [201/235], train accuracy: 0.984375, loss: 0.000961


100%|██████████| 235/235 [00:20<00:00, 11.44it/s]

Epoch: [4/30], train loss: 0.000923



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [4/30], test accuracy: 0.985100, loss: 0.233904


  0%|          | 1/235 [00:00<00:45,  5.14it/s]

tensor([-0.1996, -0.2222,  0.0267], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 4.8755e-05, -5.4287e-05, -5.3350e-05], device='cuda:0')
tensor([-0.1992, -0.2226,  0.0262], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 4.8755e-05, -5.4287e-05, -5.3350e-05], device='cuda:0')
Epoch: [5/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000891


 44%|████▍     | 103/235 [00:09<00:11, 11.42it/s]

tensor([-0.1932, -0.2310,  0.0252], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0007,  0.0010,  0.0008], device='cuda:0')
tensor([-0.1926, -0.2316,  0.0247], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0007,  0.0010,  0.0008], device='cuda:0')
Epoch: [5/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000920


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.1947, -0.2316,  0.0294], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0002,  0.0003,  0.0003], device='cuda:0')
tensor([-0.1946, -0.2317,  0.0293], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0002,  0.0003,  0.0003], device='cuda:0')
Epoch: [5/30], Batch: [201/235], train accuracy: 0.996094, loss: 0.000894


100%|██████████| 235/235 [00:20<00:00, 11.43it/s]

Epoch: [5/30], train loss: 0.000908



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [5/30], test accuracy: 0.985100, loss: 0.229320


  1%|▏         | 3/235 [00:00<00:41,  5.62it/s]

tensor([-0.1814, -0.2443,  0.0174], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0019, -0.0024, -0.0020], device='cuda:0')
tensor([-0.1814, -0.2443,  0.0173], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0019, -0.0024, -0.0020], device='cuda:0')
Epoch: [6/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000951


 44%|████▍     | 103/235 [00:09<00:11, 11.35it/s]

tensor([-0.1885, -0.2391,  0.0245], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0003, -0.0004, -0.0003], device='cuda:0')
tensor([-0.1889, -0.2388,  0.0248], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0003, -0.0004, -0.0003], device='cuda:0')
Epoch: [6/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000898


 86%|████████▋ | 203/235 [00:17<00:02, 11.35it/s]

tensor([-0.1978, -0.2303,  0.0371], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0002, -0.0003, -0.0002], device='cuda:0')
tensor([-0.1973, -0.2308,  0.0367], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0002, -0.0003, -0.0002], device='cuda:0')
Epoch: [6/30], Batch: [201/235], train accuracy: 0.996094, loss: 0.000945


100%|██████████| 235/235 [00:20<00:00, 11.39it/s]

Epoch: [6/30], train loss: 0.000897



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [6/30], test accuracy: 0.985100, loss: 0.229492


  0%|          | 1/235 [00:00<00:45,  5.17it/s]

tensor([-0.1947, -0.2330,  0.0337], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-2.2769e-05,  2.8154e-05,  2.1059e-05], device='cuda:0')
tensor([-0.1959, -0.2317,  0.0350], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-2.2769e-05,  2.8154e-05,  2.1059e-05], device='cuda:0')
Epoch: [7/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000878


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.1801, -0.2472,  0.0222], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([7.4242e-06, 6.5575e-06, 4.2600e-06], device='cuda:0')
tensor([-0.1792, -0.2482,  0.0213], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([7.4242e-06, 6.5575e-06, 4.2600e-06], device='cuda:0')
Epoch: [7/30], Batch: [101/235], train accuracy: 0.980469, loss: 0.000895


 86%|████████▋ | 203/235 [00:17<00:02, 11.35it/s]

tensor([-0.1845, -0.2472,  0.0275], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0007,  0.0009,  0.0007], device='cuda:0')
tensor([-0.1845, -0.2472,  0.0277], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0007,  0.0009,  0.0007], device='cuda:0')
Epoch: [7/30], Batch: [201/235], train accuracy: 0.980469, loss: 0.000845


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [7/30], train loss: 0.000888



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [7/30], test accuracy: 0.985100, loss: 0.229051


  0%|          | 1/235 [00:00<00:46,  5.07it/s]

tensor([-0.1956, -0.2361,  0.0385], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 2.3580e-05, -2.0721e-05, -1.4142e-05], device='cuda:0')
tensor([-0.1961, -0.2357,  0.0389], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 2.3580e-05, -2.0721e-05, -1.4142e-05], device='cuda:0')
Epoch: [8/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000877


 44%|████▍     | 103/235 [00:09<00:11, 11.34it/s]

tensor([-0.1940, -0.2407,  0.0390], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0006,  0.0007,  0.0006], device='cuda:0')
tensor([-0.1930, -0.2417,  0.0379], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0006,  0.0007,  0.0006], device='cuda:0')
Epoch: [8/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000854


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.1935, -0.2429,  0.0379], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0009, -0.0011, -0.0009], device='cuda:0')
tensor([-0.1938, -0.2426,  0.0383], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0009, -0.0011, -0.0009], device='cuda:0')
Epoch: [8/30], Batch: [201/235], train accuracy: 0.972656, loss: 0.000880


100%|██████████| 235/235 [00:20<00:00, 11.40it/s]

Epoch: [8/30], train loss: 0.000877



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [8/30], test accuracy: 0.985100, loss: 0.220349


  0%|          | 1/235 [00:00<00:46,  5.05it/s]

tensor([-0.1870, -0.2507,  0.0325], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0012,  0.0015,  0.0013], device='cuda:0')
tensor([-0.1863, -0.2515,  0.0318], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0012,  0.0015,  0.0013], device='cuda:0')
Epoch: [9/30], Batch: [1/235], train accuracy: 0.996094, loss: 0.000827


 44%|████▍     | 103/235 [00:09<00:11, 11.34it/s]

tensor([-0.1823, -0.2552,  0.0278], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0002, -0.0003, -0.0002], device='cuda:0')
tensor([-0.1809, -0.2564,  0.0266], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0002, -0.0003, -0.0002], device='cuda:0')
Epoch: [9/30], Batch: [101/235], train accuracy: 0.992188, loss: 0.000803


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.2033, -0.2361,  0.0515], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-8.9191e-05,  1.0781e-04,  9.1365e-05], device='cuda:0')
tensor([-0.2026, -0.2368,  0.0507], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-8.9191e-05,  1.0781e-04,  9.1365e-05], device='cuda:0')
Epoch: [9/30], Batch: [201/235], train accuracy: 0.980469, loss: 0.000871


100%|██████████| 235/235 [00:20<00:00, 11.40it/s]

Epoch: [9/30], train loss: 0.000869



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [9/30], test accuracy: 0.985100, loss: 0.221794


  0%|          | 1/235 [00:00<00:45,  5.15it/s]

tensor([-0.2068, -0.2347,  0.0534], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0002,  0.0002,  0.0002], device='cuda:0')
tensor([-0.2067, -0.2348,  0.0534], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0002,  0.0002,  0.0002], device='cuda:0')
Epoch: [10/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000871


 44%|████▍     | 103/235 [00:09<00:11, 11.38it/s]

tensor([-0.2081, -0.2369,  0.0529], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0007,  0.0011,  0.0009], device='cuda:0')
tensor([-0.2081, -0.2370,  0.0528], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0007,  0.0011,  0.0009], device='cuda:0')
Epoch: [10/30], Batch: [101/235], train accuracy: 0.996094, loss: 0.000771


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.2271, -0.2189,  0.0697], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 6.0578e-05, -5.8219e-05, -6.6843e-05], device='cuda:0')
tensor([-0.2269, -0.2192,  0.0695], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 6.0578e-05, -5.8219e-05, -6.6843e-05], device='cuda:0')
Epoch: [10/30], Batch: [201/235], train accuracy: 0.984375, loss: 0.000850


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [10/30], train loss: 0.000862



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [10/30], test accuracy: 0.985100, loss: 0.221472


  0%|          | 1/235 [00:00<00:45,  5.10it/s]

tensor([-0.2112, -0.2365,  0.0529], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0005,  0.0006,  0.0006], device='cuda:0')
tensor([-0.2116, -0.2359,  0.0533], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0005,  0.0006,  0.0006], device='cuda:0')
Epoch: [11/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000813


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.2036, -0.2407,  0.0461], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 5.2729e-05, -1.5484e-05, -5.8260e-05], device='cuda:0')
tensor([-0.2027, -0.2417,  0.0451], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 5.2729e-05, -1.5484e-05, -5.8260e-05], device='cuda:0')
Epoch: [11/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000899


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.2064, -0.2377,  0.0428], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0011,  0.0014,  0.0012], device='cuda:0')
tensor([-0.2070, -0.2372,  0.0433], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0011,  0.0014,  0.0012], device='cuda:0')
Epoch: [11/30], Batch: [201/235], train accuracy: 0.988281, loss: 0.000883


100%|██████████| 235/235 [00:20<00:00, 11.40it/s]

Epoch: [11/30], train loss: 0.000857



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [11/30], test accuracy: 0.985100, loss: 0.222711


  0%|          | 1/235 [00:00<00:45,  5.14it/s]

tensor([-0.2161, -0.2303,  0.0511], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0007, -0.0008, -0.0007], device='cuda:0')
tensor([-0.2172, -0.2292,  0.0521], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0007, -0.0008, -0.0007], device='cuda:0')
Epoch: [12/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000891


 44%|████▍     | 103/235 [00:09<00:11, 11.38it/s]

tensor([-0.2279, -0.2215,  0.0611], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-6.2492e-05,  9.3935e-05,  7.1941e-05], device='cuda:0')
tensor([-0.2272, -0.2224,  0.0603], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-6.2492e-05,  9.3935e-05,  7.1941e-05], device='cuda:0')
Epoch: [12/30], Batch: [101/235], train accuracy: 0.996094, loss: 0.000863


 86%|████████▋ | 203/235 [00:17<00:02, 11.37it/s]

tensor([-0.2232, -0.2263,  0.0541], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0001, -0.0002, -0.0002], device='cuda:0')
tensor([-0.2219, -0.2276,  0.0528], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0001, -0.0002, -0.0002], device='cuda:0')
Epoch: [12/30], Batch: [201/235], train accuracy: 0.980469, loss: 0.000805


100%|██████████| 235/235 [00:20<00:00, 11.42it/s]

Epoch: [12/30], train loss: 0.000853



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [12/30], test accuracy: 0.985100, loss: 0.219928


  0%|          | 1/235 [00:00<00:45,  5.09it/s]

tensor([-0.2205, -0.2311,  0.0478], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0003, -0.0004, -0.0003], device='cuda:0')
tensor([-0.2220, -0.2296,  0.0491], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0003, -0.0004, -0.0003], device='cuda:0')
Epoch: [13/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000870


 44%|████▍     | 103/235 [00:09<00:11, 11.39it/s]

tensor([-0.2665, -0.1891,  0.0910], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0001, -0.0001, -0.0001], device='cuda:0')
tensor([-0.2669, -0.1888,  0.0913], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0001, -0.0001, -0.0001], device='cuda:0')
Epoch: [13/30], Batch: [101/235], train accuracy: 0.992188, loss: 0.000858


 86%|████████▋ | 203/235 [00:17<00:02, 11.41it/s]

tensor([-0.2698, -0.1865,  0.0944], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-2.7726e-06, -1.8619e-06,  1.7734e-06], device='cuda:0')
tensor([-0.2700, -0.1863,  0.0946], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-2.7726e-06, -1.8619e-06,  1.7734e-06], device='cuda:0')
Epoch: [13/30], Batch: [201/235], train accuracy: 0.976562, loss: 0.000848


100%|██████████| 235/235 [00:20<00:00, 11.44it/s]

Epoch: [13/30], train loss: 0.000847



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [13/30], test accuracy: 0.985100, loss: 0.216291


  0%|          | 1/235 [00:00<00:46,  5.08it/s]

tensor([-0.2694, -0.1871,  0.0939], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 4.5967e-07, -2.6524e-06, -2.5897e-08], device='cuda:0')
tensor([-0.2697, -0.1868,  0.0942], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 4.5967e-07, -2.6524e-06, -2.5897e-08], device='cuda:0')
Epoch: [14/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000874


 44%|████▍     | 103/235 [00:09<00:11, 11.39it/s]

tensor([-0.2700, -0.1871,  0.0950], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
tensor([-0.2699, -0.1871,  0.0950], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
Epoch: [14/30], Batch: [101/235], train accuracy: 0.992188, loss: 0.000827


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.2723, -0.1849,  0.0970], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0002, -0.0002, -0.0002], device='cuda:0')
tensor([-0.2726, -0.1846,  0.0973], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0002, -0.0002, -0.0002], device='cuda:0')
Epoch: [14/30], Batch: [201/235], train accuracy: 0.996094, loss: 0.000821


100%|██████████| 235/235 [00:20<00:00, 11.43it/s]

Epoch: [14/30], train loss: 0.000838



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [14/30], test accuracy: 0.985100, loss: 0.218511


  0%|          | 1/235 [00:00<00:45,  5.11it/s]

tensor([-0.2716, -0.1856,  0.0959], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-8.5851e-05,  1.0197e-04,  8.8540e-05], device='cuda:0')
tensor([-0.2718, -0.1853,  0.0961], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-8.5851e-05,  1.0197e-04,  8.8540e-05], device='cuda:0')
Epoch: [15/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000841


 44%|████▍     | 103/235 [00:09<00:11, 11.39it/s]

tensor([-0.2700, -0.1880,  0.0943], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
tensor([-0.2698, -0.1882,  0.0941], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
Epoch: [15/30], Batch: [101/235], train accuracy: 0.996094, loss: 0.000846


 86%|████████▋ | 203/235 [00:17<00:02, 11.38it/s]

tensor([-0.2702, -0.1895,  0.0935], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0001, -0.0002, -0.0001], device='cuda:0')
tensor([-0.2704, -0.1893,  0.0937], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0001, -0.0002, -0.0001], device='cuda:0')
Epoch: [15/30], Batch: [201/235], train accuracy: 0.984375, loss: 0.000806


100%|██████████| 235/235 [00:20<00:00, 11.43it/s]

Epoch: [15/30], train loss: 0.000839



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [15/30], test accuracy: 0.985100, loss: 0.220259


  0%|          | 1/235 [00:00<00:44,  5.23it/s]

tensor([-0.2674, -0.1928,  0.0905], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 3.0558e-05, -4.3343e-05, -3.3907e-05], device='cuda:0')
tensor([-0.2674, -0.1927,  0.0906], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 3.0558e-05, -4.3343e-05, -3.3907e-05], device='cuda:0')
Epoch: [16/30], Batch: [1/235], train accuracy: 0.984375, loss: 0.000779


 44%|████▍     | 103/235 [00:09<00:11, 11.40it/s]

tensor([-0.2719, -0.1885,  0.0954], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 2.2361e-06, -7.3591e-06,  4.6246e-06], device='cuda:0')
tensor([-0.2720, -0.1884,  0.0954], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 2.2361e-06, -7.3591e-06,  4.6246e-06], device='cuda:0')
Epoch: [16/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000777


 86%|████████▋ | 203/235 [00:17<00:02, 11.40it/s]

tensor([-0.2724, -0.1898,  0.0957], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-8.1117e-05,  1.0705e-04,  8.8629e-05], device='cuda:0')
tensor([-0.2723, -0.1898,  0.0957], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-8.1117e-05,  1.0705e-04,  8.8629e-05], device='cuda:0')
Epoch: [16/30], Batch: [201/235], train accuracy: 0.972656, loss: 0.000804


100%|██████████| 235/235 [00:20<00:00, 11.42it/s]

Epoch: [16/30], train loss: 0.000833



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [16/30], test accuracy: 0.985100, loss: 0.215123


  0%|          | 1/235 [00:00<00:43,  5.36it/s]

tensor([-0.2705, -0.1920,  0.0934], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-3.4194e-05,  5.0058e-05,  3.7834e-05], device='cuda:0')
tensor([-0.2708, -0.1917,  0.0937], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-3.4194e-05,  5.0058e-05,  3.7834e-05], device='cuda:0')
Epoch: [17/30], Batch: [1/235], train accuracy: 0.996094, loss: 0.000826


 44%|████▍     | 103/235 [00:09<00:11, 11.28it/s]

tensor([-0.2703, -0.1932,  0.0928], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 5.8428e-05, -7.1148e-05, -6.2590e-05], device='cuda:0')
tensor([-0.2710, -0.1925,  0.0936], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 5.8428e-05, -7.1148e-05, -6.2590e-05], device='cuda:0')
Epoch: [17/30], Batch: [101/235], train accuracy: 0.976562, loss: 0.000880


 86%|████████▋ | 203/235 [00:17<00:02, 11.34it/s]

tensor([-0.2751, -0.1894,  0.0976], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 5.3181e-05, -6.4655e-05, -5.3993e-05], device='cuda:0')
tensor([-0.2753, -0.1892,  0.0978], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 5.3181e-05, -6.4655e-05, -5.3993e-05], device='cuda:0')
Epoch: [17/30], Batch: [201/235], train accuracy: 0.964844, loss: 0.000820


100%|██████████| 235/235 [00:20<00:00, 11.40it/s]

Epoch: [17/30], train loss: 0.000827



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [17/30], test accuracy: 0.985100, loss: 0.217598


  0%|          | 0/235 [00:00<?, ?it/s]

tensor([-0.2761, -0.1891,  0.0985], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 6.3696e-05, -7.6039e-05, -7.0929e-05], device='cuda:0')
tensor([-0.2758, -0.1894,  0.0981], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 6.3696e-05, -7.6039e-05, -7.0929e-05], device='cuda:0')
Epoch: [18/30], Batch: [1/235], train accuracy: 0.976562, loss: 0.000821


 44%|████▍     | 103/235 [00:09<00:11, 11.38it/s]

tensor([-0.2757, -0.1912,  0.0978], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 9.6591e-06, -1.6512e-05, -1.5774e-05], device='cuda:0')
tensor([-0.2757, -0.1913,  0.0977], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 9.6591e-06, -1.6512e-05, -1.5774e-05], device='cuda:0')
Epoch: [18/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000802


 86%|████████▋ | 203/235 [00:17<00:02, 11.37it/s]

tensor([-0.2788, -0.1892,  0.1003], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-3.9258e-05,  4.1816e-05,  3.6819e-05], device='cuda:0')
tensor([-0.2790, -0.1890,  0.1006], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-3.9258e-05,  4.1816e-05,  3.6819e-05], device='cuda:0')
Epoch: [18/30], Batch: [201/235], train accuracy: 0.996094, loss: 0.000836


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [18/30], train loss: 0.000831



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [18/30], test accuracy: 0.985100, loss: 0.221399


  0%|          | 1/235 [00:00<00:46,  5.06it/s]

tensor([-0.2826, -0.1865,  0.1037], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 2.0322e-05, -2.5260e-05, -2.3780e-05], device='cuda:0')
tensor([-0.2829, -0.1862,  0.1040], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 2.0322e-05, -2.5260e-05, -2.3780e-05], device='cuda:0')
Epoch: [19/30], Batch: [1/235], train accuracy: 0.996094, loss: 0.000883


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.2833, -0.1854,  0.1051], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0002,  0.0001], device='cuda:0')
tensor([-0.2837, -0.1850,  0.1055], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0002,  0.0001], device='cuda:0')
Epoch: [19/30], Batch: [101/235], train accuracy: 1.000000, loss: 0.000804


 86%|████████▋ | 203/235 [00:17<00:02, 11.37it/s]

tensor([-0.2802, -0.1906,  0.1013], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0003,  0.0003,  0.0003], device='cuda:0')
tensor([-0.2805, -0.1904,  0.1016], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0003,  0.0003,  0.0003], device='cuda:0')
Epoch: [19/30], Batch: [201/235], train accuracy: 0.980469, loss: 0.000794


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [19/30], train loss: 0.000823



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [19/30], test accuracy: 0.985100, loss: 0.209316


  0%|          | 1/235 [00:00<00:45,  5.11it/s]

tensor([-0.2816, -0.1897,  0.1021], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0001, -0.0001, -0.0001], device='cuda:0')
tensor([-0.2814, -0.1900,  0.1019], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0001, -0.0001, -0.0001], device='cuda:0')
Epoch: [20/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000817


 44%|████▍     | 103/235 [00:09<00:11, 11.42it/s]

tensor([-0.2785, -0.1945,  0.0981], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
tensor([-0.2777, -0.1953,  0.0973], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
Epoch: [20/30], Batch: [101/235], train accuracy: 0.980469, loss: 0.000797


 86%|████████▋ | 203/235 [00:17<00:02, 11.37it/s]

tensor([-0.2817, -0.1925,  0.1014], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-3.5908e-05,  1.0955e-05,  3.9810e-05], device='cuda:0')
tensor([-0.2809, -0.1932,  0.1006], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-3.5908e-05,  1.0955e-05,  3.9810e-05], device='cuda:0')
Epoch: [20/30], Batch: [201/235], train accuracy: 0.984375, loss: 0.000850


100%|██████████| 235/235 [00:20<00:00, 11.43it/s]

Epoch: [20/30], train loss: 0.000820



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [20/30], test accuracy: 0.985100, loss: 0.220773


  0%|          | 1/235 [00:00<00:44,  5.20it/s]

tensor([-0.2811, -0.1936,  0.1009], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0001, -0.0001, -0.0001], device='cuda:0')
tensor([-0.2813, -0.1934,  0.1011], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0001, -0.0001, -0.0001], device='cuda:0')
Epoch: [21/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000866


 44%|████▍     | 103/235 [00:09<00:11, 11.39it/s]

tensor([-0.2832, -0.1938,  0.1020], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-2.6434e-05,  3.2331e-05,  2.6113e-05], device='cuda:0')
tensor([-0.2826, -0.1943,  0.1015], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-2.6434e-05,  3.2331e-05,  2.6113e-05], device='cuda:0')
Epoch: [21/30], Batch: [101/235], train accuracy: 0.992188, loss: 0.000844


 86%|████████▋ | 203/235 [00:17<00:02, 11.40it/s]

tensor([-0.2797, -0.1968,  0.1000], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-7.9209e-05,  9.8801e-05,  8.4126e-05], device='cuda:0')
tensor([-0.2806, -0.1958,  0.1009], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-7.9209e-05,  9.8801e-05,  8.4126e-05], device='cuda:0')
Epoch: [21/30], Batch: [201/235], train accuracy: 0.992188, loss: 0.000785


100%|██████████| 235/235 [00:20<00:00, 11.43it/s]

Epoch: [21/30], train loss: 0.000816



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [21/30], test accuracy: 0.985100, loss: 0.214504


  0%|          | 1/235 [00:00<00:45,  5.13it/s]

tensor([-0.2868, -0.1909,  0.1064], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0004,  0.0005,  0.0004], device='cuda:0')
tensor([-0.2852, -0.1925,  0.1047], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0004,  0.0005,  0.0004], device='cuda:0')
Epoch: [22/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000801


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.2674, -0.2121,  0.0869], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 9.3183e-05, -1.1243e-04, -9.9646e-05], device='cuda:0')
tensor([-0.2670, -0.2125,  0.0865], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 9.3183e-05, -1.1243e-04, -9.9646e-05], device='cuda:0')
Epoch: [22/30], Batch: [101/235], train accuracy: 0.992188, loss: 0.000778


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.2657, -0.2156,  0.0855], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-5.9666e-05,  7.7145e-05,  6.5534e-05], device='cuda:0')
tensor([-0.2660, -0.2152,  0.0859], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-5.9666e-05,  7.7145e-05,  6.5534e-05], device='cuda:0')
Epoch: [22/30], Batch: [201/235], train accuracy: 0.996094, loss: 0.000878


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [22/30], train loss: 0.000812



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [22/30], test accuracy: 0.985100, loss: 0.211158


  0%|          | 1/235 [00:00<00:45,  5.10it/s]

tensor([-0.2756, -0.2053,  0.0962], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 7.9413e-06, -8.1211e-06, -1.0715e-05], device='cuda:0')
tensor([-0.2748, -0.2060,  0.0953], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 7.9413e-06, -8.1211e-06, -1.0715e-05], device='cuda:0')
Epoch: [23/30], Batch: [1/235], train accuracy: 0.980469, loss: 0.000842


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.2830, -0.1985,  0.1012], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0002,  0.0001], device='cuda:0')
tensor([-0.2822, -0.1993,  0.1005], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0002,  0.0001], device='cuda:0')
Epoch: [23/30], Batch: [101/235], train accuracy: 0.984375, loss: 0.000760


 86%|████████▋ | 203/235 [00:17<00:02, 11.39it/s]

tensor([-0.2443, -0.2417,  0.0576], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0002, -0.0003, -0.0002], device='cuda:0')
tensor([-0.2435, -0.2427,  0.0567], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0002, -0.0003, -0.0002], device='cuda:0')
Epoch: [23/30], Batch: [201/235], train accuracy: 0.984375, loss: 0.000833


100%|██████████| 235/235 [00:20<00:00, 11.42it/s]

Epoch: [23/30], train loss: 0.000810



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [23/30], test accuracy: 0.985100, loss: 0.211670


  0%|          | 1/235 [00:00<00:44,  5.21it/s]

tensor([-0.2425, -0.2454,  0.0538], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0019, -0.0025, -0.0021], device='cuda:0')
tensor([-0.2482, -0.2395,  0.0593], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0019, -0.0025, -0.0021], device='cuda:0')
Epoch: [24/30], Batch: [1/235], train accuracy: 1.000000, loss: 0.000820


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.3043, -0.1833,  0.1120], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 9.3223e-05, -1.1268e-04, -1.0210e-04], device='cuda:0')
tensor([-0.3046, -0.1831,  0.1123], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 9.3223e-05, -1.1268e-04, -1.0210e-04], device='cuda:0')
Epoch: [24/30], Batch: [101/235], train accuracy: 0.980469, loss: 0.000783


 86%|████████▋ | 203/235 [00:17<00:02, 11.37it/s]

tensor([-0.2986, -0.1900,  0.1072], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0001, -0.0002, -0.0001], device='cuda:0')
tensor([-0.2986, -0.1899,  0.1073], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0001, -0.0002, -0.0001], device='cuda:0')
Epoch: [24/30], Batch: [201/235], train accuracy: 0.988281, loss: 0.000816


100%|██████████| 235/235 [00:20<00:00, 11.42it/s]

Epoch: [24/30], train loss: 0.000808



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [24/30], test accuracy: 0.985100, loss: 0.210946


  0%|          | 1/235 [00:00<00:45,  5.14it/s]

tensor([-0.3071, -0.1817,  0.1160], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
tensor([-0.3073, -0.1815,  0.1162], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
Epoch: [25/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000783


 44%|████▍     | 103/235 [00:09<00:11, 11.42it/s]

tensor([-0.3012, -0.1880,  0.1095], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 1.7209e-05, -2.2682e-05, -2.0365e-05], device='cuda:0')
tensor([-0.3024, -0.1869,  0.1106], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 1.7209e-05, -2.2682e-05, -2.0365e-05], device='cuda:0')
Epoch: [25/30], Batch: [101/235], train accuracy: 0.972656, loss: 0.000792


 86%|████████▋ | 203/235 [00:17<00:02, 11.39it/s]

tensor([-0.2929, -0.1972,  0.1011], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 9.0322e-05, -1.0479e-04, -9.3220e-05], device='cuda:0')
tensor([-0.2933, -0.1968,  0.1014], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 9.0322e-05, -1.0479e-04, -9.3220e-05], device='cuda:0')
Epoch: [25/30], Batch: [201/235], train accuracy: 0.992188, loss: 0.000814


100%|██████████| 235/235 [00:20<00:00, 11.44it/s]

Epoch: [25/30], train loss: 0.000803



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [25/30], test accuracy: 0.985100, loss: 0.210629


  0%|          | 1/235 [00:00<00:45,  5.12it/s]

tensor([-0.2904, -0.2004,  0.0981], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
tensor([-0.2899, -0.2009,  0.0976], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
Epoch: [26/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000826


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.2972, -0.1955,  0.1048], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 6.9483e-05, -8.1687e-05, -7.4281e-05], device='cuda:0')
tensor([-0.2961, -0.1966,  0.1038], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 6.9483e-05, -8.1687e-05, -7.4281e-05], device='cuda:0')
Epoch: [26/30], Batch: [101/235], train accuracy: 0.984375, loss: 0.000798


 86%|████████▋ | 203/235 [00:17<00:02, 11.36it/s]

tensor([-0.2733, -0.2224,  0.0760], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 0.0005, -0.0006, -0.0005], device='cuda:0')
tensor([-0.2741, -0.2216,  0.0768], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 0.0005, -0.0006, -0.0005], device='cuda:0')
Epoch: [26/30], Batch: [201/235], train accuracy: 0.988281, loss: 0.000791


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [26/30], train loss: 0.000806



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [26/30], test accuracy: 0.985100, loss: 0.211049


  0%|          | 1/235 [00:00<00:45,  5.20it/s]

tensor([-0.2987, -0.1956,  0.1018], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 1.6050e-05, -2.0230e-05, -1.8178e-05], device='cuda:0')
tensor([-0.2993, -0.1949,  0.1025], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 1.6050e-05, -2.0230e-05, -1.8178e-05], device='cuda:0')
Epoch: [27/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000822


 44%|████▍     | 103/235 [00:09<00:11, 11.40it/s]

tensor([-0.2807, -0.2133,  0.0830], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-3.1515e-05,  3.4508e-05,  3.0589e-05], device='cuda:0')
tensor([-0.2819, -0.2123,  0.0841], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-3.1515e-05,  3.4508e-05,  3.0589e-05], device='cuda:0')
Epoch: [27/30], Batch: [101/235], train accuracy: 0.988281, loss: 0.000857


 86%|████████▋ | 203/235 [00:17<00:02, 11.40it/s]

tensor([-0.3049, -0.1901,  0.1033], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 1.7018e-05, -2.2570e-06, -5.4725e-06], device='cuda:0')
tensor([-0.3046, -0.1905,  0.1029], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 1.7018e-05, -2.2570e-06, -5.4725e-06], device='cuda:0')
Epoch: [27/30], Batch: [201/235], train accuracy: 0.972656, loss: 0.000801


100%|██████████| 235/235 [00:20<00:00, 11.44it/s]

Epoch: [27/30], train loss: 0.000801



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [27/30], test accuracy: 0.985100, loss: 0.210372


  0%|          | 1/235 [00:00<00:45,  5.18it/s]

tensor([-0.3004, -0.1962,  0.0983], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0002,  0.0001], device='cuda:0')
tensor([-0.3011, -0.1956,  0.0989], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0002,  0.0001], device='cuda:0')
Epoch: [28/30], Batch: [1/235], train accuracy: 0.988281, loss: 0.000811


 44%|████▍     | 103/235 [00:09<00:11, 11.38it/s]

tensor([-0.3100, -0.1927,  0.0987], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0002,  0.0002,  0.0002], device='cuda:0')
tensor([-0.3105, -0.1922,  0.0993], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0002,  0.0002,  0.0002], device='cuda:0')
Epoch: [28/30], Batch: [101/235], train accuracy: 0.976562, loss: 0.000809


 86%|████████▋ | 203/235 [00:17<00:02, 11.42it/s]

tensor([-0.3099, -0.1947,  0.0982], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
tensor([-0.3105, -0.1941,  0.0987], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-0.0001,  0.0001,  0.0001], device='cuda:0')
Epoch: [28/30], Batch: [201/235], train accuracy: 0.992188, loss: 0.000787


100%|██████████| 235/235 [00:20<00:00, 11.44it/s]

Epoch: [28/30], train loss: 0.000799



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [28/30], test accuracy: 0.985100, loss: 0.205464


  0%|          | 1/235 [00:00<00:45,  5.10it/s]

tensor([-0.3124, -0.1920,  0.1007], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 8.5903e-05, -1.0447e-04, -9.2103e-05], device='cuda:0')
tensor([-0.3117, -0.1927,  0.1000], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 8.5903e-05, -1.0447e-04, -9.2103e-05], device='cuda:0')
Epoch: [29/30], Batch: [1/235], train accuracy: 1.000000, loss: 0.000726


 44%|████▍     | 103/235 [00:09<00:11, 11.35it/s]

tensor([-0.3319, -0.1772,  0.1161], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-2.5930e-05,  3.7251e-05,  3.0558e-05], device='cuda:0')
tensor([-0.3345, -0.1746,  0.1186], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-2.5930e-05,  3.7251e-05,  3.0558e-05], device='cuda:0')
Epoch: [29/30], Batch: [101/235], train accuracy: 0.984375, loss: 0.000816


 86%|████████▋ | 203/235 [00:17<00:02, 11.37it/s]

tensor([-0.3561, -0.1555,  0.1391], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-4.3809e-05,  6.2079e-05,  5.0029e-05], device='cuda:0')
tensor([-0.3561, -0.1556,  0.1390], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-4.3809e-05,  6.2079e-05,  5.0029e-05], device='cuda:0')
Epoch: [29/30], Batch: [201/235], train accuracy: 0.984375, loss: 0.000780


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [29/30], train loss: 0.000796



  0%|          | 0/235 [00:00<?, ?it/s]

Epoch: [29/30], test accuracy: 0.985100, loss: 0.207899


  0%|          | 1/235 [00:00<00:46,  5.08it/s]

tensor([-0.3513, -0.1619,  0.1337], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([ 9.0154e-05, -1.1879e-04, -1.0595e-04], device='cuda:0')
tensor([-0.3517, -0.1616,  0.1340], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([ 9.0154e-05, -1.1879e-04, -1.0595e-04], device='cuda:0')
Epoch: [30/30], Batch: [1/235], train accuracy: 0.968750, loss: 0.000817


 44%|████▍     | 103/235 [00:09<00:11, 11.36it/s]

tensor([-0.3363, -0.1802,  0.1151], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-7.3695e-05,  9.5779e-05,  8.1121e-05], device='cuda:0')
tensor([-0.3355, -0.1810,  0.1143], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-7.3695e-05,  9.5779e-05,  8.1121e-05], device='cuda:0')
Epoch: [30/30], Batch: [101/235], train accuracy: 0.980469, loss: 0.000792


 86%|████████▋ | 203/235 [00:17<00:02, 11.37it/s]

tensor([-0.3354, -0.1809,  0.1146], device='cuda:0', grad_fn=<SliceBackward>)
the grads before are tensor([-1.3398e-05,  1.8941e-05,  1.3899e-05], device='cuda:0')
tensor([-0.3345, -0.1818,  0.1137], device='cuda:0', grad_fn=<SliceBackward>)
the grads after are tensor([-1.3398e-05,  1.8941e-05,  1.3899e-05], device='cuda:0')
Epoch: [30/30], Batch: [201/235], train accuracy: 0.968750, loss: 0.000812


100%|██████████| 235/235 [00:20<00:00, 11.41it/s]

Epoch: [30/30], train loss: 0.000791





Epoch: [30/30], test accuracy: 0.985100, loss: 0.207699


In [6]:
# Persist the network's current weights (taken via state_dict()) to a
# checkpoint file in the working directory.
recon_state = net.state_dict()
torch.save(recon_state, "./CapsNetMNIST_Recon.pth")

In [7]:
# Show the shape of every parameter that still has requires_grad set,
# i.e. the ones an optimizer would actually update.
for weight in filter(lambda p: p.requires_grad == True, net.parameters()):
    print(weight.shape)

torch.Size([512, 160])
torch.Size([512])
torch.Size([1024, 512])
torch.Size([1024])
torch.Size([784, 1024])
torch.Size([784])


In [None]:
# NOTE(review): this loop was left without a body, which is a SyntaxError
# and aborts a "Restart & Run All". `pass` keeps the cell valid until the
# intended per-parameter logic is filled in (or the cell is deleted).
for p in net.parameters():
    pass

In [10]:
# Scratch check: halve the scalar 0.1307 (the same value that is passed as
# the first argument to transforms.Normalize elsewhere in this notebook).
torch.div(torch.tensor(0.1307), 2)

tensor(0.0654)

In [None]:
from torchvision import transforms

# Build a Normalize transform from explicit mean/std tuples and display the
# std it stores, to check how the constructor arguments are kept.
norm_mean = (0.1307,)
norm_std = (0.3081,)
a = transforms.Normalize(mean=norm_mean, std=norm_std)
a.std

In [None]:
# Alternative config for 16 1-d vectors in the Capsule Layer. Set the softmax dimension to 1 in this case.
# class Config:
#     def __init__(self, dataset='mnist'):
#         # CNN (cnn)
#         self.cnn_in_channels = 1
#         self.cnn_out_channels = 12
#         self.cnn_kernel_size = 15

#         # Primary Capsule (pc)
#         self.pc_num_capsules = 1
#         self.pc_in_channels = 12
#         self.pc_out_channels = 16
#         self.pc_kernel_size = 8
#         self.pc_num_routes = 16 * 7 * 7

#         # Digit Capsule 1 (dc)
#         self.dc_num_capsules = 49
#         self.dc_num_routes = 16 * 7 * 7
#         self.dc_in_channels = 1
#         self.dc_out_channels = 1 #16
        
#         # Digit Capsule 2 (dc)
#         self.dc_2_num_capsules = 10
#         self.dc_2_num_routes = 7 * 7
#         self.dc_2_in_channels = 1 #16
#         self.dc_2_out_channels = 16

#         # Decoder
#         self.input_width = 28
#         self.input_height = 28