In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transform


class Network(nn.Module):
    """Small convolutional classifier producing 10 class scores per image.

    Layout: two (5x5 convolution -> ReLU -> 2x2 max-pool) stages, then two
    ReLU fully connected layers and a final linear layer. The output is the
    raw score of each class (no softmax is applied here).

    The first fully connected layer is sized for 28x28 inputs; other spatial
    sizes produce a shape error in ``fc1``.

    Args:
        channels: number of channels of the input images (default 1, grayscale).
    """

    def __init__(self, channels=1): # default grayscale
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=channels, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        # Spatial size left after both conv/pool stages for a 28x28 input:
        # ((28-5+1)/2 -5 +1)/2 = 4, so fc1 sees 12 channels * 4 * 4 features.
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Compute class scores for a batch of images.

        Args:
            t: tensor of shape (batch, channels, height, width).

        Returns:
            Tensor of shape (batch, 10) with one unnormalised score per class.
        """
        # hidden conv layers, conv w/ relu activation -> max pool
        t = F.relu(self.conv1(t))
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        t = F.relu(self.conv2(t))
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # hidden fully connected layers
        # Flatten everything except the batch dimension. Unlike
        # reshape(-1, 12*4*4), this can never fold a feature-count mismatch
        # into the batch dimension: a wrongly sized input fails in fc1 instead
        # of silently returning the wrong number of rows.
        t = t.flatten(start_dim=1)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))

        # output layer
        t = self.out(t)
        return t
    
def get_num_correct(preds, labels):
    """Count the predictions whose highest-scoring class equals the label.

    Args:
        preds: 2-D tensor of per-class scores, one row per sample.
        labels: tensor of target class indices, one per sample.

    Returns:
        The number of matching rows as a plain Python number.
    """
    predicted_classes = preds.argmax(dim=1)
    hits = predicted_classes == labels
    return hits.sum().item()

### Pipeline

Prepare the data -> Build the model -> Train the model -> __Analyze the model's results__

To aid in the analysis, we use TensorBoard.

In [0]:
import torch
from torch.utils.tensorboard import SummaryWriter

# Record the library versions this notebook was run with.
print(torch.__version__)
# The leading `!` is IPython syntax: the rest of the line runs as a shell command.
!tensorboard --version

1.2.0
1.15.0a20190806


### TensorBoard: TensorFlow's Visualization Toolkit

TensorBoard provides the visualization and tooling needed for machine learning experimentation:

* Tracking and visualizing metrics such as loss and accuracy
* Visualizing the model graph (ops and layers)
* Viewing histograms of weights, biases, or other tensors as they change over time
* Projecting embeddings to a lower dimensional space
* Displaying images, text, and audio data
* Profiling TensorFlow programs
* And much more

In [0]:
# Toy example, writing an image
from torch.utils.tensorboard import SummaryWriter

# Build the dataset and loader in this cell: it previously relied on a
# `train_loader` that is only defined in a later cell, so it failed with a
# NameError on a fresh kernel (Restart & Run All).
train_set = torchvision.datasets.FashionMNIST(
    root='./data/FashionMNIST',
    download=False,  # the dataset must already be on disk under `root`
    transform=transform.ToTensor())
train_loader = torch.utils.data.DataLoader(train_set, batch_size=100)

tb = SummaryWriter()
network = Network()

try:
    # Take the first batch and log it as one tiled image.
    images, labels = next(iter(train_loader))
    grid = torchvision.utils.make_grid(images)
    tb.add_image('images', grid)
    # Optionally also log the model graph:
    # tb.add_graph(network, images)
finally:
    # Close the writer even if loading or logging raised.
    tb.close()

Run `tensorboard --logdir=runs` in a terminal. The TensorBoard UI is served at http://localhost:6006 by default; TensorBoard prints the actual URL when it starts (in the run below it used port 6007).

In [0]:
# Start the TensorBoard server from the notebook (`!` runs a shell command).
# The cell keeps running until the server is interrupted; the recorded output ends with ^C.
!tensorboard --logdir=runs

TensorBoard 1.15.0a20190806 at http://makd0-v1.local:6007/ (Press CTRL+C to quit)
^C


### Hyperparameter Tuning

#### Basic use case: `add_scalar` and `add_histogram`

In [0]:
import torch
from torch.utils.tensorboard import SummaryWriter


# Get data
train_set = torchvision.datasets.FashionMNIST(
    root='./data/FashionMNIST',
    download=False,
    transform=transform.ToTensor())

train_loader = torch.utils.data.DataLoader(train_set, batch_size=100)


# Compile network
network = Network()
optimizer = optim.Adam(network.parameters(), lr=0.001)

# Initialize tensorboard
tb = SummaryWriter() # from torch.utils.tensorboard import SummaryWriter

# Training
try:
    for epoch in range(10):
        total_loss = 0      # sum of the per-batch mean losses over the epoch
        total_correct = 0   # correctly classified samples over the epoch

        for batch in train_loader:
            images, labels = batch
            preds = network(images)

            loss = F.cross_entropy(preds, labels) # loss function
            optimizer.zero_grad()                 # set all gradients to zero

            loss.backward()         # calculate gradients, training points are supply constants
            optimizer.step()        # update weights to minimize loss (accdg to adam)

            total_loss += loss.item()
            total_correct += get_num_correct(preds, labels)

        # One accuracy value, derived from the dataset size, is used for both
        # the TensorBoard scalar and the printed summary (no hard-coded 60000).
        accuracy = total_correct / len(train_set)

        tb.add_scalar('Loss', total_loss, epoch)
        tb.add_scalar('Number Correct', total_correct, epoch)
        tb.add_scalar('Accuracy', accuracy, epoch)

        tb.add_histogram('conv1.bias', network.conv1.bias, epoch)
        tb.add_histogram('conv1.weight', network.conv1.weight, epoch)
        # Gradients are zeroed before every backward pass, so this is the
        # gradient from the last batch of the epoch.
        tb.add_histogram('conv1.weight.grad', network.conv1.weight.grad, epoch)
        print("epoch", epoch, "train_acc", accuracy, "loss:", total_loss)
finally:
    # Close the writer even if training is interrupted or fails.
    tb.close()

epoch 0 train_acc 0.69515 loss: 477.1251989901066
epoch 1 train_acc 0.8025333333333333 loss: 313.45684093236923
epoch 2 train_acc 0.8437 loss: 260.7022297382355
epoch 3 train_acc 0.8593166666666666 loss: 233.27016121149063
epoch 4 train_acc 0.8682 loss: 217.13399057090282
epoch 5 train_acc 0.8745166666666667 loss: 205.5287000834942
epoch 6 train_acc 0.8801666666666667 loss: 196.3009224832058
epoch 7 train_acc 0.8845666666666666 loss: 188.59119561314583
epoch 8 train_acc 0.8886 loss: 182.20187175273895
epoch 9 train_acc 0.8932666666666667 loss: 176.2308282405138


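#### Run comments: experimenting with different hyperparameter values

The `comment` argument of `SummaryWriter` is appended to the run's name, so each hyperparameter combination appears as a separately labelled run in TensorBoard.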

In [0]:
import torch
from torch.utils.tensorboard import SummaryWriter
from itertools import product

# Get data
# Re-creates the FashionMNIST dataset object with the same arguments as the
# earlier training cell. The `train` function defined in this cell reads this
# module-level `train_set` directly.
train_set = torchvision.datasets.FashionMNIST(
    root='./data/FashionMNIST',
    download=False,
    transform=transform.ToTensor())

def train(lr, batch_size, shuffle, num_epochs=5, log_histograms=False):
    """Train a fresh `Network` on the module-level `train_set` and log to TensorBoard.

    Each call creates its own `SummaryWriter`; the hyperparameters are put in
    the writer's `comment`, which is appended to the run name so that runs can
    be told apart in the TensorBoard UI.

    Args:
        lr: learning rate for the Adam optimizer.
        batch_size: number of samples per batch.
        shuffle: whether the data loader reshuffles the data every epoch.
        num_epochs: number of passes over `train_set`.
        log_histograms: if True, also log histograms of the first conv layer's
            bias, weights and weight gradient after every epoch.

    Returns:
        None. Per-epoch loss, number correct and accuracy are written to
        TensorBoard and printed.
    """
    # data loader
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=shuffle)

    # compile network
    network = Network()
    optimizer = optim.Adam(network.parameters(), lr=lr)

    # Initialize tensorboard
    tb = SummaryWriter(comment=f' lr={lr} batch_size={batch_size} shuffle={shuffle}') # this is appended

    # Training
    print('\nlr=', lr, 'batch_size=', batch_size, 'shuffle=', shuffle)
    try:
        for epoch in range(num_epochs):
            total_loss = 0
            total_correct = 0

            for images, labels in train_loader:
                preds = network(images)
                loss = F.cross_entropy(preds, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # cross_entropy returns the batch mean, so scale by the number
                # of samples actually in this batch to get the absolute loss.
                # The final batch is smaller than `batch_size` whenever the
                # dataset size is not a multiple of it.
                total_loss += loss.item() * len(labels)
                total_correct += get_num_correct(preds, labels)

            accuracy = total_correct / len(train_set)

            tb.add_scalar('Loss', total_loss, epoch)
            tb.add_scalar('Number Correct', total_correct, epoch)
            tb.add_scalar('Accuracy', accuracy, epoch)

            if log_histograms:
                tb.add_histogram('conv1.bias', network.conv1.bias, epoch)
                tb.add_histogram('conv1.weight', network.conv1.weight, epoch)
                # The gradient is only populated once a backward pass has run.
                if network.conv1.weight.grad is not None:
                    tb.add_histogram('conv1.weight.grad', network.conv1.weight.grad, epoch)

            print("epoch", epoch, "\t train_acc", accuracy, "\t loss:", total_loss)
    finally:
        # Close the writer even if training is interrupted or fails.
        tb.close()

In [2]:
# NOTE(review): tensorflow is not referenced by any other cell visible here; confirm this cell is still needed.
import tensorflow

# Bare last expression: the version string is shown as the cell's output.
tensorflow.__version__

'1.14.0'

In [0]:
lr_list = [0.1, 0.01, 0.001]
batch_size_list = [10, 100, 1000]
shuffle_list = [True, False]

SEED = 0  # arbitrary fixed value; any integer works

# hyperparameter grid search
for lr, batch_size, shuffle in product(lr_list, batch_size_list, shuffle_list):
    # Reseed before every run so that each configuration starts from the same
    # initial weights and the whole search is reproducible across reruns.
    torch.manual_seed(SEED)
    train(lr, batch_size, shuffle, num_epochs=10)


lr= 0.1 batch_size= 10 shuffle= True
epoch 0 train_acc 0.09998333333333333 loss: 143734.69151377678
epoch 1 train_acc 0.10081666666666667 loss: 139562.04599261284
epoch 2 train_acc 0.09995 loss: 139506.0769557953
epoch 3 train_acc 0.09901666666666667 loss: 139512.84832715988
epoch 4 train_acc 0.09858333333333333 loss: 139588.2374048233
epoch 5 train_acc 0.09991666666666667 loss: 139559.05871391296
epoch 6 train_acc 0.10071666666666666 loss: 139466.00037813187
epoch 7 train_acc 0.09988333333333334 loss: 139555.5360364914
epoch 8 train_acc 0.09948333333333333 loss: 139610.83234071732
epoch 9 train_acc 0.10116666666666667 loss: 139508.01008224487

lr= 0.1 batch_size= 10 shuffle= False
epoch 0 train_acc 0.10206666666666667 loss: 140052.88788318634
epoch 1 train_acc 0.10206666666666667 loss: 139448.9784836769
epoch 2 train_acc 0.10208333333333333 loss: 139448.9783024788
epoch 3 train_acc 0.10208333333333333 loss: 139448.97848844528
epoch 4 train_acc 0.10208333333333333 loss: 139448.9785337

epoch 1 train_acc 0.8563166666666666 loss: 23415.183152332902
epoch 2 train_acc 0.8746333333333334 loss: 20230.23938426515
epoch 3 train_acc 0.88575 loss: 18352.39462211728
epoch 4 train_acc 0.8941166666666667 loss: 17072.932726568542
epoch 5 train_acc 0.8989666666666667 loss: 16131.96897850692
epoch 6 train_acc 0.9049666666666667 loss: 15324.956324783998
epoch 7 train_acc 0.9089666666666667 loss: 14608.77691005764
epoch 8 train_acc 0.9115 loss: 14042.013526456794
epoch 9 train_acc 0.91455 loss: 13612.006936194957

lr= 0.001 batch_size= 100 shuffle= True
epoch 0 train_acc 0.6998166666666666 loss: 47777.79334783554
epoch 1 train_acc 0.8070166666666667 loss: 31040.731501579285
epoch 2 train_acc 0.8384666666666667 loss: 26725.708861649036
epoch 3 train_acc 0.8547833333333333 loss: 24067.706793546677
epoch 4 train_acc 0.8643833333333333 loss: 22141.204045712948
epoch 5 train_acc 0.8735166666666667 loss: 20828.07229757309
epoch 6 train_acc 0.8769 loss: 19997.14133143425
epoch 7 train_acc 0.

__Remarks__:
1. With the large learning rate 0.1, the model does not improve beyond about 10% accuracy, which is chance level for 10 classes.
2. We can imitate the `sklearn` API by defining a `.fit(self, dataset, lr, batch_size, shuffle, num_epochs)` method on the network class.

### Summary: End-to-end use.

The `SummaryWriter` object saves whatever is written to it during training as log files in the `runs` folder. TensorBoard then reads these log files. The interface can be viewed by running `tensorboard --logdir=runs` in a terminal.

1. Import `from torch.utils.tensorboard import SummaryWriter`
2. Initialize `tb = SummaryWriter()`
3. Write using `tb.add_scalar`, `tb.add_histogram`, comments, etc. 
4. Close the writer, `tb.close()`
5. View the interface from the terminal `tensorboard --logdir=runs`.

### Early-stopping

We can set a `prev_loss` variable to save the last loss value calculated (best done for validation loss). If the loss increases beyond a set tolerance (the network starts overfitting), we stop the training.