In [2]:
import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms
import numpy as np

Sometimes we need deeper networks so that some features have broader receptive fields and can therefore be functions of a larger portion of the image.

## Padding and Stride

In [3]:
# Function from previous notebook
# Converting to function for future use, default num_workers is 4 bc CPU threads
def load_fashion_mnist(batch_size: int = 512, num_workers: int = 4):
    """Download Fashion-MNIST (if not already present) and wrap both splits in DataLoaders.

    Args:
        batch_size: Number of samples per batch, used for both loaders.
        num_workers: Number of worker processes each DataLoader uses to load batches.

    Returns:
        A ``(train_data_loader, test_data_loader)`` tuple of shuffled DataLoaders.
    """
    # ToTensor converts each image into a tensor
    data_transform = transforms.ToTensor()

    # Download (if needed) and open the train and test splits under ../data
    mnist_train = torchvision.datasets.FashionMNIST(root="../data", train=True, transform=data_transform, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root="../data", train=False, transform=data_transform, download=True)

    # Honour the caller's num_workers instead of a hard-coded 4.
    train_data_loader = data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers)
    # NOTE(review): the test split is shuffled as well; evaluation normally does not need this.
    test_data_loader = data.DataLoader(mnist_test, batch_size, shuffle=True, num_workers=num_workers)

    return train_data_loader, test_data_loader

In [4]:
train_loader, test_loader = load_fashion_mnist() # Build the train/test loaders with the default batch size and worker count

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [12]:
# Checking the label shape for prediction: each batch is a (data, labels) pair, so `i` is the data batch and `x` the label batch
for i,x in train_loader:
    print(x[0]) # First label of each batch - a label is a single class index, not a vector of class probabilities

tensor(1)
tensor(9)
tensor(3)
tensor(3)
tensor(2)
tensor(2)
tensor(6)
tensor(5)
tensor(8)
tensor(7)
tensor(3)
tensor(6)
tensor(4)
tensor(7)
tensor(1)
tensor(4)
tensor(1)
tensor(4)
tensor(0)
tensor(5)
tensor(4)
tensor(3)
tensor(9)
tensor(1)
tensor(5)
tensor(8)
tensor(3)
tensor(9)
tensor(1)
tensor(1)
tensor(3)
tensor(4)
tensor(7)
tensor(8)
tensor(5)
tensor(0)
tensor(1)
tensor(4)
tensor(9)
tensor(2)
tensor(1)
tensor(6)
tensor(8)
tensor(5)
tensor(4)
tensor(7)
tensor(4)
tensor(9)
tensor(7)
tensor(9)
tensor(4)
tensor(5)
tensor(1)
tensor(1)
tensor(0)
tensor(5)
tensor(6)
tensor(2)
tensor(0)
tensor(5)
tensor(4)
tensor(8)
tensor(4)
tensor(7)
tensor(9)
tensor(6)
tensor(6)
tensor(3)
tensor(9)
tensor(0)
tensor(9)
tensor(3)
tensor(4)
tensor(6)
tensor(4)
tensor(3)
tensor(2)
tensor(9)
tensor(5)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(2)
tensor(4)
tensor(6)
tensor(1)
tensor(5)
tensor(6)
tensor(8)
tensor(7)
tensor(9)
tensor(5)
tensor(2)
tensor(8)
tensor(2)
tensor(8)
tensor(0)
tensor(0)
tensor(2)


**Padding** - adding a border of zeros around the perimeter of an image so that, as networks grow deeper and receptive fields grow larger, we do not lose edge information entirely in a cacophony of aggregations that consistently favors the greater number of pixels in the image center. Padding adds no information (zeros contribute nothing to the weighted sums) yet allows the kernel to be placed over the image edges, hence capturing valuable aggregations whose receptive field covers only the image edge.

Change to output dimensionality: by adding padding we add $p$ rows and $z$ columns (in total) to the input image, and hence an identical number to the output: $(n_{rows} - k_{rows} + p + 1) \times (n_{cols} - k_{cols} + z + 1)$. The kernel still cannot be placed at the last $k - 1$ positions along each axis, but the image is now augmented by $p$ rows and $z$ columns.

Padding is often symmetrical so as to capture equivalent edge information on every side of the image.

We often choose odd-sized kernels because we sometimes want to preserve dimensionality when aggregating information. How? With an odd kernel size $k$, the total padding needed, $k - 1$, is even, so it can be split symmetrically ($(k - 1)/2$ on each side, capturing equal edge information), giving $n - k + 1 + (k - 1) = n$, the original length. Why? It offers the benefit that any value $y[i, j]$ of the output is the cross-correlation of the kernel with the window of $x$ centered at $x[i, j]$.

In [21]:
# Proof of concept of the above statement - padded tensor
# 3x3 block of values surrounded by one ring of zero padding (5x5 in total)
t1 = torch.Tensor([
    [0, 0, 0, 0, 0],
    [0, 1, 2, 3, 0],
    [0, 2, 3, 4, 0],
    [0, 3, 4, 5, 0],
    [0, 0, 0, 0, 0],
])
# 3x3 kernel whose weights grow towards the right: the right-most column matters most
kernel = torch.Tensor([
    [0, 1, 2],
    [0, 1, 2],
    [0, 1, 2],
])

In [19]:
def cross_cor(t1: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    """Cross-correlate a 2-D tensor with a 2-D kernel using a stride of 1 and no implicit padding.

    Args:
        t1: 2-D input tensor (pad it beforehand if edge positions should be covered).
        kernel: 2-D weighting kernel, no larger than ``t1`` along either axis.

    Returns:
        A tensor of shape ``(t1.shape[0] - kernel.shape[0] + 1, t1.shape[1] - kernel.shape[1] + 1)``
        in the default dtype; entry ``[i, j]`` is the sum of the element-wise product of the
        kernel with the window of ``t1`` whose top-left corner is ``(i, j)``.
    """
    kernel_rows, kernel_cols = kernel.shape[0], kernel.shape[1]
    # The kernel cannot slide past the border, so each axis loses (kernel size - 1) positions.
    out_rows = t1.shape[0] - kernel_rows + 1
    out_cols = t1.shape[1] - kernel_cols + 1
    # Every cell is overwritten below, so allocate zeros rather than random values:
    # torch.rand would needlessly advance the global RNG state.
    latent_representation = torch.zeros(out_rows, out_cols)
    for i in range(out_rows):
        for j in range(out_cols):
            # Window of the input currently covered by the kernel
            window = t1[i:i + kernel_rows, j:j + kernel_cols]
            # Weigh the window by the kernel and aggregate it into a single value
            latent_representation[i, j] = (window * kernel).sum()
    return latent_representation

In [24]:
t1, kernel, cross_cor(t1, kernel) # Padding compensates for half of the kernel height/width, so the kernel can be centered on every original (unpadded) element
# With (k - 1) / 2 zeros on each side, the kernel's first position is already centered on the first original element
# This is a functional benefit: features centered on the input's edges are carried directly into the output, with the original dimensionality preserved
# Requires total padding p = k - 1, so that n - k + 1 + (k - 1) = n = original dimension (here the 3x3 interior of t1 gives a 3x3 output)
# When the kernel is not a square matrix, different padding values can be set for height and width

(tensor([[0., 0., 0., 0., 0.],
         [0., 1., 2., 3., 0.],
         [0., 2., 3., 4., 0.],
         [0., 3., 4., 5., 0.],
         [0., 0., 0., 0., 0.]]),
 tensor([[0., 1., 2.],
         [0., 1., 2.],
         [0., 1., 2.]]),
 tensor([[13., 19.,  7.],
         [24., 33., 12.],
         [19., 25.,  9.]]))

In [52]:
class ConvNetPadding(torch.nn.Module):
    """Two padded single-channel convolutions followed by a linear classifier.

    The linear layer expects 100 flattened features, which matches 1x28x28 inputs:
    each convolution maps a side of length n to n - k + 2 * padding + 1, so
    28 -> 17 (14x14 kernel, padding 1) -> 10 (10x10 kernel, padding 1), and 10 * 10 = 100.
    """

    def __init__(self, *dims):
        """Register the layers; ``dims`` is accepted for compatibility but not used."""
        # The parent constructor must run before any submodule is registered
        super().__init__()
        # Assigning a Module as an attribute registers it under that name
        # (the original misspelled `_modules` here, so construction raised AttributeError).
        self.layer_1 = torch.nn.Conv2d(1, 1, kernel_size=(14, 14), padding=(1, 1))
        self.layer_2 = torch.nn.Conv2d(1, 1, kernel_size=(10, 10), padding=(1, 1))
        # Outputs 10 scores per sample; the labels themselves are single class indices
        self.remaining_network = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(100, 10))

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Map a batch of images to 10 scores per sample.

        Args:
            X: Batch of single-channel images, expected as (batch, 1, 28, 28).

        Returns:
            Tensor of shape (batch, 10).
        """
        # Convolve and break linearity twice
        X = torch.nn.functional.relu(self.layer_1(X))
        X = torch.nn.functional.relu(self.layer_2(X))
        # Flatten and map to the output space
        return self.remaining_network(X)

In [53]:
# Applying init to model to initialize all layer weights
def build_convnet(lr: float = 0.005):
    """Create a fresh ConvNetPadding together with its optimizer and loss.

    Args:
        lr: Learning rate for the Adam optimizer (defaults to the previously hard-coded 0.005).

    Returns:
        A ``(model, trainer, loss)`` tuple: the model, an Adam optimizer over its
        parameters, and a CrossEntropyLoss instance.
    """
    # ConvNetPadding is looked up when this function is called, so whichever
    # definition of the class is current in the notebook gets used.
    model = ConvNetPadding()
    trainer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = torch.nn.CrossEntropyLoss()
    return model, trainer, loss

In [54]:
# Fresh model, optimizer and loss function from build_convnet()
model, optimizer, cost = build_convnet()

In [55]:
def train_model(model: torch.nn.Module, optimizer, cost, train_loader, epochs: int = 5):
    """Train ``model`` in place, printing the loss as it goes.

    During the first epoch the loss of every batch is printed; after every epoch the
    loss of that epoch's last batch is printed.

    Args:
        model: Network to optimize.
        optimizer: Optimizer built over ``model``'s parameters.
        cost: Loss callable invoked as ``cost(predictions, labels)``.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        epochs: Number of passes over ``train_loader``.
    """
    # Make sure layers such as dropout / batch norm behave in training mode
    model.train()
    for epoch in range(epochs):
        # Stays None if the loader yields nothing, so the summary print below
        # cannot fail on an undefined name
        loss = None
        # `inputs` rather than `data`, to avoid shadowing the torch.utils.data import
        for inputs, labels in train_loader:
            # Reset gradients accumulated by the previous step
            optimizer.zero_grad()

            # Cross entropy (or whichever cost was passed) for this batch
            loss = cost(model(inputs), labels)

            # Show every batch loss during the first epoch only
            if epoch == 0:
                print("cost: ", loss)

            # Compute gradients, then update the parameters
            loss.backward()
            optimizer.step()

        # Loss of the last batch, printed once per epoch
        if loss is not None:
            print("cost: ", loss)

In [56]:
train_model(model, optimizer, cost, train_loader, 5) # Train for 5 epochs; the printed loss falls from ~2.31 (only the loss is reported here, accuracy itself is not measured)

cost:  tensor(2.3071, grad_fn=<NllLossBackward>)
cost:  tensor(2.3009, grad_fn=<NllLossBackward>)
cost:  tensor(2.2959, grad_fn=<NllLossBackward>)
cost:  tensor(2.2916, grad_fn=<NllLossBackward>)
cost:  tensor(2.2831, grad_fn=<NllLossBackward>)
cost:  tensor(2.2829, grad_fn=<NllLossBackward>)
cost:  tensor(2.2658, grad_fn=<NllLossBackward>)
cost:  tensor(2.2509, grad_fn=<NllLossBackward>)
cost:  tensor(2.2708, grad_fn=<NllLossBackward>)
cost:  tensor(2.2399, grad_fn=<NllLossBackward>)
cost:  tensor(2.2319, grad_fn=<NllLossBackward>)
cost:  tensor(2.2248, grad_fn=<NllLossBackward>)
cost:  tensor(2.2076, grad_fn=<NllLossBackward>)
cost:  tensor(2.1902, grad_fn=<NllLossBackward>)
cost:  tensor(2.1850, grad_fn=<NllLossBackward>)
cost:  tensor(2.1495, grad_fn=<NllLossBackward>)
cost:  tensor(2.1392, grad_fn=<NllLossBackward>)
cost:  tensor(2.1012, grad_fn=<NllLossBackward>)
cost:  tensor(2.0740, grad_fn=<NllLossBackward>)
cost:  tensor(2.0226, grad_fn=<NllLossBackward>)
cost:  tensor(1.9835

In [57]:
# Major performance boost with padding - edge features are likely valuable enough that a kernel which can also sit over edge information is more useful (the reduction caused by a 14x14 kernel is drastic and loses half of the image)

Let's see if even more padding contributes!

In [84]:
class ConvNetPadding(torch.nn.Module):
    """Three padded single-channel convolutions followed by a linear classifier.

    Sized for 1x28x28 inputs: with padding 2 on every side each convolution maps a
    side of length n to n - k + 4 + 1, so 28 -> 19 -> 14 -> 10, and the classifier
    receives 10 * 10 = 100 flattened features.
    """

    def __init__(self, *dims):
        """Register the layers; ``dims`` is accepted but not used."""
        # Parent constructor first, so that submodules can be registered afterwards
        super().__init__()
        # 28x28 -> 19x19 localized features that include edge information
        self.layer_1 = torch.nn.Conv2d(1, 1, kernel_size=(14, 14), padding=(2, 2))
        # 19x19 -> 14x14
        self.layer_2 = torch.nn.Conv2d(1, 1, kernel_size=(10, 10), padding=(2, 2))
        # 14x14 -> 10x10; the receptive field no longer aggregates the edge information away entirely
        self.layer_3 = torch.nn.Conv2d(1, 1, kernel_size=(9, 9), padding=(2, 2))

        # 10 scores per sample; the labels themselves are single class indices
        self.remaining_network = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(100, 10))

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Run the convolution stack, then flatten and map to the 10 output scores."""
        hidden = X
        # Each convolution is followed by a ReLU to break linearity
        for conv in (self.layer_1, self.layer_2, self.layer_3):
            hidden = torch.relu(conv(hidden))
        return self.remaining_network(hidden)

In [85]:
# Rebuild so that build_convnet() picks up the ConvNetPadding class redefined above
model, optimizer, cost = build_convnet()

In [87]:
train_model(model, optimizer, cost, train_loader, 5) # Model appears to perform better with better edge features
# NOTE(review): the recorded losses already start around 0.57 rather than ~2.3, which suggests this cell was re-run on an already-trained model - confirm before comparing runs

cost:  tensor(0.5682, grad_fn=<NllLossBackward>)
cost:  tensor(0.5926, grad_fn=<NllLossBackward>)
cost:  tensor(0.5861, grad_fn=<NllLossBackward>)
cost:  tensor(0.6680, grad_fn=<NllLossBackward>)
cost:  tensor(0.6007, grad_fn=<NllLossBackward>)
cost:  tensor(0.5969, grad_fn=<NllLossBackward>)
cost:  tensor(0.6198, grad_fn=<NllLossBackward>)
cost:  tensor(0.5877, grad_fn=<NllLossBackward>)
cost:  tensor(0.5558, grad_fn=<NllLossBackward>)
cost:  tensor(0.6123, grad_fn=<NllLossBackward>)
cost:  tensor(0.6031, grad_fn=<NllLossBackward>)
cost:  tensor(0.5890, grad_fn=<NllLossBackward>)
cost:  tensor(0.6380, grad_fn=<NllLossBackward>)
cost:  tensor(0.5951, grad_fn=<NllLossBackward>)
cost:  tensor(0.5767, grad_fn=<NllLossBackward>)
cost:  tensor(0.6086, grad_fn=<NllLossBackward>)
cost:  tensor(0.7384, grad_fn=<NllLossBackward>)
cost:  tensor(0.5012, grad_fn=<NllLossBackward>)
cost:  tensor(0.6455, grad_fn=<NllLossBackward>)
cost:  tensor(0.5381, grad_fn=<NllLossBackward>)
cost:  tensor(0.5320

In [93]:
def test_model(model: torch.nn.Module, cost, test_loader):
    total_loss = torch.Tensor([0])
    # Computing #batches
    batch_num = 0
    for data, labels, in test_loader: # Returns data, label tuple
        batch_num+=1
        # Ensuring gradient is not computed
        with torch.no_grad():
            # Obtaining cross entropy cost
            loss = cost(model(data), labels)
            total_loss += loss.sum()
    return total_loss/batch_num

In [94]:
test_model(model, cost, test_loader) # Mean per-batch test loss is in line with the training losses printed above, so the model scales pretty nicely to the test set

tensor([0.5569])

## Stride

We use stride to downsample (view sparsely sampled parts of image) or reduce latent shape for the sake of computational efficiency

We can stride by different amounts vertically and horizontally, reducing the latent dimensions in an uneven manner. Along a dimension of length $n$ the output length is $\lfloor (n - k + p + s) / s \rfloor = \lfloor (n - k + p) / s \rfloor + 1$, as stride does not change the initial "fit". So if an array has 6 elements with a stride of 2, a kernel of 2 and padding of 2: $(6 - 2 + 2) / 2 + 1 = 4$. This is because stride evenly divides the number of positions (downsampling by $1/s$), padding adds $p$ elements to $n$, and the kernel takes $k - 1$ elements away - hence the need for the $+s/s$, i.e. the $+1$, in order to compensate for the fact that kernels enumerate their last fit.

In [95]:
# Utilizing stride in a neural network: attempt
class ConvNetPaddingStride(torch.nn.Module):
    """Three padded single-channel convolutions, the last one strided, plus a linear classifier.

    Sized for 1x28x28 inputs: 28 -> 19 (14x14 kernel, padding 2) -> 16 (8x8 kernel,
    padding 2) -> 8 (5x5 kernel, padding 2, stride 2), so the classifier receives
    8 * 8 = 64 flattened features.
    """

    def __init__(self, *dims):
        """Register the layers; ``dims`` is accepted but not used."""
        # Parent constructor first, so that submodules can be registered afterwards
        super().__init__()
        # 28x28 -> 19x19 localized features that include edge information
        self.layer_1 = torch.nn.Conv2d(1, 1, kernel_size=(14, 14), padding=(2, 2))
        # 19x19 -> 16x16
        self.layer_2 = torch.nn.Conv2d(1, 1, kernel_size=(8, 8), padding=(2, 2))
        # 16x16 -> 8x8: a stride of 2 downsamples the positions the kernel visits
        self.layer_3 = torch.nn.Conv2d(1, 1, kernel_size=(5, 5), padding=(2, 2), stride=(2, 2))

        # 10 scores per sample; the labels themselves are single class indices
        self.remaining_network = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(64, 10))

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Run the convolution stack, then flatten and map to the 10 output scores."""
        hidden = X
        # Each convolution is followed by a ReLU to break linearity
        for conv in (self.layer_1, self.layer_2, self.layer_3):
            hidden = torch.relu(conv(hidden))
        return self.remaining_network(hidden)

In [97]:
# Applying init to model to initialize all layer weights
def build_convnet_stride(lr: float = 0.005):
    """Create a fresh ConvNetPaddingStride together with its optimizer and loss.

    Args:
        lr: Learning rate for the Adam optimizer (defaults to the previously hard-coded 0.005).

    Returns:
        A ``(model, trainer, loss)`` tuple: the model, an Adam optimizer over its
        parameters, and a CrossEntropyLoss instance.
    """
    model = ConvNetPaddingStride()
    trainer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = torch.nn.CrossEntropyLoss()
    return model, trainer, loss

In [98]:
# Fresh strided model, optimizer and loss function from build_convnet_stride()
model, optimizer, cost = build_convnet_stride()

In [99]:
train_model(model, optimizer, cost, train_loader, 5) # By downsampling in the last layer we are likely discarding some features, hence training a worse network

cost:  tensor(2.3062, grad_fn=<NllLossBackward>)
cost:  tensor(2.3029, grad_fn=<NllLossBackward>)
cost:  tensor(2.3035, grad_fn=<NllLossBackward>)
cost:  tensor(2.2981, grad_fn=<NllLossBackward>)
cost:  tensor(2.3017, grad_fn=<NllLossBackward>)
cost:  tensor(2.2994, grad_fn=<NllLossBackward>)
cost:  tensor(2.2860, grad_fn=<NllLossBackward>)
cost:  tensor(2.2746, grad_fn=<NllLossBackward>)
cost:  tensor(2.2495, grad_fn=<NllLossBackward>)
cost:  tensor(2.2227, grad_fn=<NllLossBackward>)
cost:  tensor(2.1881, grad_fn=<NllLossBackward>)
cost:  tensor(2.1469, grad_fn=<NllLossBackward>)
cost:  tensor(2.0479, grad_fn=<NllLossBackward>)
cost:  tensor(2.0112, grad_fn=<NllLossBackward>)
cost:  tensor(1.9069, grad_fn=<NllLossBackward>)
cost:  tensor(1.7843, grad_fn=<NllLossBackward>)
cost:  tensor(1.7078, grad_fn=<NllLossBackward>)
cost:  tensor(1.6416, grad_fn=<NllLossBackward>)
cost:  tensor(1.5398, grad_fn=<NllLossBackward>)
cost:  tensor(1.4760, grad_fn=<NllLossBackward>)
cost:  tensor(1.3184