# Training a model part 2 - loss, gradient, backprop

In [14]:
# Now let's move on to actually training data. 

# The training process:
# 1) Get batch from training set
# 2) Pass batch to network
# 3) Calculate the loss
# 4) Get the gradient of loss vs network weights
# 5) Update the weights according to this gradient to reduce loss. 

# 6) Repeat 1-5 for one epoch (one pass through full training set)
# 7) Complete 1-6 for appropriate number of epochs. 

In [15]:
# We already know steps 1 and 2. What about the loss function? 
import torch
import math
import numpy as np
import torchvision # computer vision package 
import torchvision.transforms as transforms # interface for common IP transforms. 
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms 
torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(True)

<torch.autograd.grad_mode.set_grad_enabled at 0x121405e10>

In [16]:
# alright. Back to implementing the network. 
import torch.nn.functional as F

class Network(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 5)
        self.conv2 = nn.Conv2d(in_channels = 6, out_channels = 12, kernel_size = 5)
        
        self.fc1 = nn.Linear(in_features = 12*4*4, out_features = 120) # "fully connected". Also, "dense"
        self.fc2 = nn.Linear(in_features = 120, out_features = 60)
        self.out = nn.Linear(in_features = 60, out_features = 10)
        
    def forward(self, t):
        # time to actually implement forward, layer by layer
        # 1) input layer
        t = t
        
        # 2) conv1 - 3 operations. Convolution, activation, and pooling. 
        t = self.conv1(t) # that easy - we just call the layer obejct on our input! Neat. 
        t = F.relu(t) # activation function
        t = F.max_pool2d(t, kernel_size = 2, stride = 2) # pool the convolution output 
        
        # 3) conv2
        t = self.conv2(t) # that easy - we just call the layer obejct on our input! Neat. 
        t = F.relu(t) 
        t = F.max_pool2d(t, kernel_size = 2, stride = 2)
        
        # 4) fc1
        t = t.reshape(-1, 12 * 4 * 4) # when we swap from conv to linear, we need to reshape
        # 12 is number of output channels from previous layer
        # 4x4 is height and with of each of the channels, determined by previous pooling operations. 
        t = self.fc1(t)
        t = F.relu(t)
        
        # 5) fc2
        t = self.fc2(t)
        t = F.relu(t)

        # 6) out 
        t = self.out(t)
#         t.softmax(t, dim=1) # change to percentages. Different activation function
        
        return t

In [17]:
def get_num_correct(preds, labels):
    return preds.argmax(dim=1).eq(labels).sum() # number of correct guesses

In [18]:
# LOAD
train_set = torchvision.datasets.FashionMNIST(
    root = './data/FashionMNIST',
    train = True,
    download = True,
    transform = transforms.Compose([ # This is the transform. Easy. 
        transforms.ToTensor() # TRANSFORM
    ])
)

In [19]:
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 100)

In [20]:
batch = next(iter(train_loader))
images, labels = batch

# Calculating Loss

In [67]:
network = Network()
preds = network(images)

In [68]:
loss = F.cross_entropy(preds, labels)

In [69]:
loss.item() # summed differences of predictions from labels 

2.3082938194274902

In [70]:
for i in range(len(preds)):
    loss_i = abs(preds[i].max().item() - labels[i])
    if i%10==0:
        print("{}) cl: {}".format(i, loss_i))

0) cl: 8.872130393981934
10) cl: 0.12297342717647552
20) cl: 2.8746678829193115
30) cl: 4.868659973144531
40) cl: 5.863234043121338
50) cl: 2.876112937927246
60) cl: 4.870473384857178
70) cl: 2.8767216205596924
80) cl: 0.8737058043479919
90) cl: 8.871868133544922


In [71]:
# we have our loss. Now, what about gradients?
print(network.conv1.weight.grad)

None


In [72]:
loss.backward()

In [73]:
print(network.conv1.weight.grad)

tensor([[[[-1.0213e-03, -6.0217e-04, -3.5090e-04, -6.6514e-04, -1.6521e-03],
          [-6.5744e-04, -3.2261e-04, -3.8479e-04, -7.4669e-04, -1.7587e-03],
          [-7.6745e-04, -3.9491e-04, -4.6891e-04, -1.0875e-03, -1.7523e-03],
          [-8.4279e-04, -4.1597e-04, -2.8424e-04, -1.0495e-03, -1.8093e-03],
          [-3.8704e-04, -5.6582e-04, -6.3969e-04, -1.5271e-03, -1.8251e-03]]],


        [[[-3.5149e-04, -3.5799e-05,  5.0507e-06, -4.1301e-06,  8.2016e-07],
          [-2.0818e-04,  7.2200e-06, -2.9325e-06, -6.7616e-06,  2.4076e-06],
          [-1.5251e-04,  1.3392e-06, -5.3205e-06, -1.1740e-05,  1.5879e-05],
          [-1.4451e-04, -4.7039e-06,  4.7099e-06,  3.9046e-05, -1.1307e-04],
          [-7.1386e-05,  1.0393e-05,  2.0262e-05, -3.8235e-05, -4.3980e-04]]],


        [[[ 1.1552e-03,  2.9186e-04, -4.0394e-04, -2.4013e-04, -6.2014e-04],
          [ 8.4670e-04,  4.1739e-04, -3.2125e-04, -5.0695e-04, -5.2845e-04],
          [ 8.3865e-04,  4.0994e-04, -6.7180e-05, -7.4328e-04, -6.43

In [74]:
# Nice backprop is done, and gradients for each weight tensor in the grad attribute have been updated
# step 4 done

# Update The Weights

In [75]:
# update weights according to gradients
# we need an optimizer to do this. 
optimizer = optim.Adam(network.parameters(), lr=.01) # need to pass the actual weights we want updated. 
# these weights live in the network's parameters instance. 
# lr is the learning rate. Dictates the size of the "leap" each learning step takes 

get_num_correct(preds, labels)

tensor(4)

In [76]:
optimizer.step() # take a step

In [77]:
preds = network(images)

In [78]:
get_num_correct(preds, labels)
loss = F.cross_entropy(preds, labels)

In [79]:
loss.item()

2.2794179916381836

In [80]:
loss.backward()
optimizer.step()
preds = network(images)
loss = F.cross_entropy(preds, labels)

In [81]:
loss.item() # bada bing bada boom. 

2.234861373901367

In [82]:
get_num_correct(preds, labels)

tensor(15)

In [92]:
# now in a loop. 
i = 0
network = Network()
optimizer = optim.Adam(network.parameters(), lr=.01) # need to pass the actual weights we want updated. 

preds = network(images)
loss = F.cross_entropy(preds, labels)

while i < 500:
    i+=1
    if i%50==0:
        preds = network(images)
        loss = F.cross_entropy(preds, labels)
        print("i:{}\n\t\tloss: {}".format(i, loss.item()))
        print("\t\tnum_correct: {}".format(get_num_correct(preds, labels)))
        loss.backward()
        optimizer.step()
        
# here, we jump over a local minimum a few times. check out how num_correct goes up and down


i:50
		loss: 2.303581476211548
		num_correct: 11
i:100
		loss: 2.2861835956573486
		num_correct: 11
i:150
		loss: 2.261082410812378
		num_correct: 12
i:200
		loss: 2.2494986057281494
		num_correct: 12
i:250
		loss: 2.198390483856201
		num_correct: 16
i:300
		loss: 2.1417696475982666
		num_correct: 21
i:350
		loss: 2.114436388015747
		num_correct: 33
i:400
		loss: 2.068920373916626
		num_correct: 37
i:450
		loss: 1.9670729637145996
		num_correct: 36
i:500
		loss: 1.8496395349502563
		num_correct: 29


In [93]:
# Now let's try it with a bunch of different batches - this is what is actually relevant

# Epoch Training Loop

In [100]:
network = Network()
optimizer = optim.Adam(network.parameters(), lr=.01) # need to pass the actual weights we want updated. 

total_loss = 0
total_correct = 0

for batch in train_loader:
    images, labels = batch
    preds = network(images)
    loss = F.cross_entropy(preds, labels)

    total_loss += loss.item()
    total_correct += get_num_correct(preds, labels)
    
    optimizer.zero_grad() # clean up the gradients before calculating new ones with each step
    loss.backward()
    optimizer.step()
    
print("epoch:", 0, "total_correct:", total_correct, "loss:", total_loss)

epoch: 0 total_correct: tensor(47745) loss: 324.13429021835327


In [104]:
total_correct.item()/len(train_set) # not bad, not bad at all
# changing the batch size means that we change the number of steps we can take via the learning step. 

0.79575