### Suggestion: Fix conflicting installations

Conflicting package installations found. Depending on the order of
installations and uninstallations, behavior may be undefined. Please
uninstall ALL versions of TensorFlow and TensorBoard, then reinstall
ONLY the desired version of TensorFlow, which will transitively pull
in the proper version of TensorBoard. (If you use TensorBoard without
TensorFlow, just reinstall the appropriate version of TensorBoard
directly.)

Namely:

	pip uninstall tb-nightly tensorboard tensorflow-estimator tensorflow-gpu tf-estimator-nightly
	pip install tensorflow  # or `tensorflow-gpu`, or `tf-nightly`, ...

In [1]:
# imports
import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm

# transforms
transform = transforms.Compose(
    [transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))])

# datasets, I have already downloaded
trainset = torchvision.datasets.FashionMNIST('~/.pytorch',
    download=True, train=True, transform=transform)
testset = torchvision.datasets.FashionMNIST('~/.pytorch',
    download=True, train=False, transform=transform)

# dataloaders
batch_size = 1000
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

# constant for classes
classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
        'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')

# helper function to show an image
# (used in the `plot_classes_preds` function below)
def matplotlib_imshow(img, one_channel=False):
    if one_channel:
        img = img.mean(dim=0)
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    if one_channel:
        plt.imshow(npimg, cmap="Greys")
    else:
        plt.imshow(torch.permute(npimg, (1, 2, 0)).numpy())

In [2]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=256, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [3]:
from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
tb = SummaryWriter('runs/fashion_mnist_experiment_1')

In [4]:
# runnig on http://localhost:6006/
# tensorboard --logdir=runs # copy to commandline

In [5]:
@torch.no_grad()
def get_num_correct(preds, labels):
    return preds.argmax(dim=1).eq(labels).sum().item()
@torch.no_grad()
def get_all_preds(model, loader):
    all_preds = torch.tensor([])
    for batch in loader:
        images, labels = batch
        
        preds = model(images)
        all_preds = torch.cat(
            (all_preds, preds),
            dim=0
        )
    return all_preds

In [6]:
for name, weight in net.named_parameters():
    print(name, weight.shape)

conv1.weight torch.Size([6, 1, 5, 5])
conv1.bias torch.Size([6])
conv2.weight torch.Size([16, 6, 5, 5])
conv2.bias torch.Size([16])
fc1.weight torch.Size([120, 256])
fc1.bias torch.Size([120])
fc2.weight torch.Size([84, 120])
fc2.bias torch.Size([84])
fc3.weight torch.Size([10, 84])
fc3.bias torch.Size([10])


In [7]:
from itertools import product

In [8]:
# 2 * 3 * 2 = 12 different training sessions
parameters = dict(
    lr = [.01, .001],
    batch_size = [10, 100, 1000],
    shuffle = [True, False]
)

In [9]:
param_values = [v for v in parameters.values()]
param_values

[[0.01, 0.001], [10, 100, 1000], [True, False]]

In [10]:
# let's see what is goning on.
for lr, batch_size, shuffle in product(*param_values):
    print(lr, batch_size, shuffle)

0.01 10 True
0.01 10 False
0.01 100 True
0.01 100 False
0.01 1000 True
0.01 1000 False
0.001 10 True
0.001 10 False
0.001 100 True
0.001 100 False
0.001 1000 True
0.001 1000 False


In [11]:
# let's see what is goning on.
for lr, batch_size, shuffle in product(*param_values): 
    comment = f' batch_size={batch_size} lr={lr} shuffle={shuffle}'

    # Training process given the set of parameters

In [12]:
epochs = 10
batch_size_list=[100, 1000, 10000]
lr_list=[.01, .001, .0001, .00001]
criterion = nn.CrossEntropyLoss()

In [13]:
# loop over for batch_size and lr_list hyper parameter
for lr, batch_size, shuffle in tqdm(product(*param_values), 'hyper parameter testing'):
    net = Net()
    # lr = 0.001
    trainset = torchvision.datasets.FashionMNIST('~/.pytorch',
    download=True, train=True, transform=transform)

    optimizer = optim.Adam(net.parameters(), lr=lr)

    #Not necessary. Test it.
    images, labels = next(iter(trainloader))
    grid = torchvision.utils.make_grid(images)

    comment = f' batch_size={batch_size} lr={lr} shuffle={shuffle}'
    tb=SummaryWriter(comment=comment)
    #Not necessary. Test it.
    tb.add_image('images', grid)
    tb.add_graph(net, images)

    # main training loop
    for epoch in tqdm(range(epochs), 'Training'):

        total_loss = 0
        total_correct = 0

        for batch in trainloader: # Get Batch
            # Pass Batch
            images, labels = batch
            # forward pass
            outputs = net(images)
            # Calculate Loss
            loss = criterion(outputs, labels)
            # zero the parameter gradients
            optimizer.zero_grad()
            # Calculate Gradient
            loss.backward()
            # Update Weights
            optimizer.step()
            
            # or total_loss+=loss.item()*images.shape[0]
            total_loss += loss.item() * batch_size # for comporable batch sizes
            total_correct += get_num_correct(outputs, labels)

        # epoch <-> epoch * len(trainloader) + i
        tb.add_scalar('Loss', total_loss, epoch)
        tb.add_scalar('Number Correct', total_correct, epoch)
        tb.add_scalar('Accuracy', total_correct / len(trainset), epoch)

        #tb.add_histogram('conv1.bias', net.conv1.bias, epoch)
        #tb.add_histogram('conv1.weight', net.conv1.weight, epoch)
        #tb.add_histogram('conv1.weight.grad', net.conv1.weight.grad, epoch)
        for name, weight in net.named_parameters():
            tb.add_histogram(name, weight, epoch)
            tb.add_histogram(f'{name}.grad', weight.grad, epoch) # grads also has same shape

        print("epoch", epoch, 
              "total_correct:", total_correct, 
              "loss:", total_loss
              )

    tb.close()
print("training finished")

hyper parameter testing: 0it [00:00, ?it/s]
Training:   0%|          | 0/10 [00:00<?, ?it/s][A
Training:  10%|█         | 1/10 [00:11<01:43, 11.46s/it][A
Training:  20%|██        | 2/10 [00:22<01:30, 11.34s/it][A
Training:  30%|███       | 3/10 [00:33<01:18, 11.28s/it][A
Training:  40%|████      | 4/10 [00:45<01:07, 11.33s/it][A
Training:  50%|█████     | 5/10 [00:56<00:56, 11.26s/it][A
Training:  60%|██████    | 6/10 [01:07<00:45, 11.28s/it][A
Training:  70%|███████   | 7/10 [01:18<00:33, 11.18s/it][A
Training:  80%|████████  | 8/10 [01:29<00:22, 11.17s/it][A
Training:  90%|█████████ | 9/10 [01:40<00:11, 11.15s/it][A
Training: 100%|██████████| 10/10 [01:51<00:00, 11.18s/it][A
hyper parameter testing: 1it [01:54, 114.58s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s][A
Training:  10%|█         | 1/10 [00:11<01:40, 11.21s/it][A
Training:  20%|██        | 2/10 [00:22<01:29, 11.24s/it][A
Training:  30%|███       | 3/10 [00:33<01:18, 11.28s/it][A
Training:  40%|████   

epoch 0 total_correct: 39257 loss: 541.1765885353088
epoch 1 total_correct: 49602 loss: 277.9078522324562
epoch 2 total_correct: 51468 loss: 229.14370715618134
epoch 3 total_correct: 52532 loss: 201.99220895767212
epoch 4 total_correct: 53102 loss: 185.3171768784523
epoch 5 total_correct: 53491 loss: 176.01554304361343
epoch 6 total_correct: 53928 loss: 162.99029365181923
epoch 7 total_correct: 53885 loss: 163.12776997685432
epoch 8 total_correct: 54306 loss: 152.14353054761887
epoch 9 total_correct: 54592 loss: 145.2783463895321
epoch 0 total_correct: 40097 loss: 516.8755614757538
epoch 1 total_correct: 49363 loss: 277.67229348421097
epoch 2 total_correct: 51513 loss: 226.73743546009064
epoch 3 total_correct: 52419 loss: 205.15164077281952
epoch 4 total_correct: 52953 loss: 189.8842516541481
epoch 5 total_correct: 53443 loss: 177.29526296257973
epoch 6 total_correct: 53815 loss: 168.05337235331535
epoch 7 total_correct: 54125 loss: 157.8394030034542
epoch 8 total_correct: 54305 loss: 