In [37]:
import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms
import numpy as np

In [38]:
data_transform = transforms.ToTensor() # Automated data to tensor conversion

In [41]:
# Bugfix to disable ssl checking 
import ssl 
ssl._create_default_https_context = ssl._create_unverified_context 

In [244]:
def load_cifar_data(batch_s:int = 512, data_transform = None):
    """Loads CIFAR10 Data"""
    
    # Loading data
    cifar_train = torchvision.datasets.CIFAR10(root = "../data", train = True, transform = data_transform, download = True)
    # Defining DataLoader object
    cifar_train_loader = data.DataLoader(cifar_train, batch_s, shuffle = True, num_workers = 2)
    cifar_test = torchvision.datasets.CIFAR10(root = "../data", train = False, transform = data_transform, download = True)
    cifar_test_loader = data.DataLoader(cifar_test, batch_s, shuffle = False, num_workers = 2)

    return cifar_train_loader, cifar_test_loader


In [245]:
cifar_train, cifar_test = load_cifar_data(512, data_transform)


Files already downloaded and verified
Files already downloaded and verified


In [206]:
# Adding Line numbers from now on for ease of use

When we have multiple input channels as in CIFAR, we can find different weighting across them to aggregate into a single output channel, or we can allow output channels to be maintained, using information from all 3 to form more "detailed" features

In [260]:
class MultiChannelConvNet(torch.nn.Module):
    """Defines MultiChannel Convolutional NN to be trained on CIFAR 10 label prediction"""
    def __init__(self):
        super().__init__()
        # 3 dimensional convolutioal layer  - learning latent representations as useful to enumerate distinct useful features across channels and proceed to aggregate them
        self._modules["first_layer"] = torch.nn.Conv2d(3, 5, padding  = 2, kernel_size = (10,10), stride = 3) # 9x9x5
        # Must specify padding for every dim
        self._modules["second_layer"] = torch.nn.Conv2d(5, 5, padding  = 2, kernel_size = (9,9)) # 5 * 5 * 5
        self._modules["remaining_network"] = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(125,45), torch.nn.ReLU(), torch.nn.Linear(45,15), torch.nn.ReLU(), torch.nn.Linear(15,10))
    
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        # Defining multichannel computation
        X = torch.nn.functional.relu(self._modules["first_layer"](X))
        X = torch.nn.functional.relu(self._modules["second_layer"](X))
        X = self._modules["remaining_network"](X)
        return X
        
        

In [261]:
# Applying init to model to initialize all layer weights
def build_convnet():
    # Kernels are iterating across all three dimensions and all batch sizes simultaneously
    model = MultiChannelConvNet()
    trainer = torch.optim.Adam(model.parameters(), lr = 0.005)
    loss = torch.nn.CrossEntropyLoss()
    return model, trainer, loss

In [262]:
model, optimizer, cost = build_convnet() 

In [263]:
model # Initial mapping from 3 to 1 channels, then 1 -> 1

MultiChannelConvNet(
  (first_layer): Conv2d(3, 5, kernel_size=(10, 10), stride=(3, 3), padding=(2, 2))
  (second_layer): Conv2d(5, 5, kernel_size=(9, 9), stride=(1, 1), padding=(2, 2))
  (remaining_network): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=125, out_features=45, bias=True)
    (2): ReLU()
    (3): Linear(in_features=45, out_features=15, bias=True)
    (4): ReLU()
    (5): Linear(in_features=15, out_features=10, bias=True)
  )
)

In [264]:
def train_model(model: torch.nn.Module, optimizer, cost, train_loader, epochs: int = 5):
    for epoch in range(epochs):
        # Using training loader reorganizes with channel first
        for data, labels in train_loader: # Returns data, label tuple
            optimizer.zero_grad()
            
            # Obtaining cross entropy cost
            loss = cost(model(data), labels)
                
            # Resetting gradient
            # Computing gradients
            loss.backward()
            # Displaying cost every 10 iterations
            optimizer.step()
        
        # Printing end of epoch
        print("cost: ", loss)

In [265]:
train_model(model, optimizer, cost, cifar_train, 5) # Training architecture for 5 epochs - reasonably improves on training data

cost:  tensor(1.8996, grad_fn=<NllLossBackward>)
cost:  tensor(1.7116, grad_fn=<NllLossBackward>)
cost:  tensor(1.6793, grad_fn=<NllLossBackward>)
cost:  tensor(1.5457, grad_fn=<NllLossBackward>)
cost:  tensor(1.6389, grad_fn=<NllLossBackward>)


1D Kernels are used to compute localized aggregations of features across input channels, yielding aggregated "channel feature" data

In [282]:
# Attempting different architrecture
class MultiChannelConvNet(torch.nn.Module):
    """Defines MultiChannel Convolutional NN to be trained on CIFAR 10 label prediction"""
    def __init__(self):
        super().__init__()
        # 3 dimensional convolutioal layer  - learning latent representations as useful to enumerate distinct useful features across channels and proceed to aggregate them
        self._modules["first_layer"] = torch.nn.Conv2d(3, 5,padding = 2, kernel_size = (15,15), stride = 2) # 11* 11 **5
        self._modules["second_layer"] = torch.nn.Conv2d(5, 1, kernel_size = (1,1)) # 11*11*1
        # Must specify padding for every dim
        self._modules["third_layer"] = torch.nn.Conv2d(1,1, padding  = 2, kernel_size = (16,16)) # 11 * 11
        self._modules["remaining_network"] = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(121,15), torch.nn.ReLU(), torch.nn.Linear(15,10))
    
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        # Defining multichannel computation
        X = torch.nn.functional.relu(self._modules["first_layer"](X))
        X = torch.nn.functional.relu(self._modules["second_layer"](X))
        X = torch.nn.functional.relu(self._modules["third_layer"](X))
        X = self._modules["remaining_network"](X)
        return X
        

In [283]:
model, optimizer, cost = build_convnet() 

In [284]:
train_model(model, optimizer, cost, cifar_train, 5) # Training architecture for 5 epochs - This architecture seems to perform far worse on CIFAR

cost:  tensor(2.3045, grad_fn=<NllLossBackward>)
cost:  tensor(2.3025, grad_fn=<NllLossBackward>)
cost:  tensor(2.3028, grad_fn=<NllLossBackward>)
cost:  tensor(2.3027, grad_fn=<NllLossBackward>)
cost:  tensor(2.3027, grad_fn=<NllLossBackward>)


## Pooling

Intuitively, aggregating set of learned localized feature into a global question - goal of pool is to take all intermediary features within a receptive field and aggregate them into a single digit indicative of something. Coarser maps / aggregations = global representations

Downsample and aggregate, also lower sensativity to location

Whereas a Kernel computes a stochastic weighting function, the pooling window is parameterless, performing deterministic aggregations across the training data

A pxq pooling layer is a sliding window, computing the maxium or average of the subtensor in that specific position. 

This is positionally impartial as (with a max pool for example) no matter where a specific feature is, you will obtain the same output so long as it is within the locality window. Common intuition is images have tendency to deform merely because of movements of the camera, so insensitivity to minor shifts (likely the same deterministic aggregation so long as within a more general locality) should help recognize images.

In [413]:
# Testing empirically - remember to adjust stride as identical by default
class MultiChannelConvNetPool(torch.nn.Module):
    """Defines MultiChannel Convolutional NN to be trained on CIFAR 10 label prediction"""
    def __init__(self):
        super().__init__()
        # 3 dimensional convolutioal layer  - learning latent representations as useful to enumerate distinct useful features across channels and proceed to aggregate them
        self._modules["first_layer"] = torch.nn.Conv2d(3, 25, padding  = 2, kernel_size = (10,10), stride = 5) # 6x6x25
        # Must specify padding for every dim
        self._modules["second_layer"] = torch.nn.Conv2d(25,30, padding =2, kernel_size = (5,5), stride = 1)
        self._modules["third_layer"] = torch.nn.MaxPool2d(2, padding =1, stride = 2) #4x4x35
        # Skipping2, taking Maxpool of 3x3 Kernel window, returns 5x5x15
        self._modules["remaining_network"] = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(270,100), torch.nn.ReLU(), torch.nn.Linear(100,10))
    
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        # Defining multichannel computation
        X = torch.nn.functional.relu(self._modules["first_layer"](X))
        X = torch.nn.MaxPool2d(2, padding=1, stride = 2)(X)
        X = torch.nn.functional.relu(self._modules["second_layer"](X))
        # Not taking ReLU of what is already a linear linear transformation from ReLU space
        X = self._modules["third_layer"](X)
        X = self._modules["remaining_network"](X)
        return X

In [414]:
# Applying init to model to initialize all layer weights
def build_convnet_pool():
    # Kernels are iterating across all three dimensions and all batch sizes simultaneously
    model = MultiChannelConvNetPool()
    trainer = torch.optim.Adam(model.parameters(), lr = 0.005)
    loss = torch.nn.CrossEntropyLoss()
    return model, trainer, loss

In [415]:
model, optimizer, cost = build_convnet_pool() 

In [416]:
# Model should - have deep featured latent space, reduce sensitivity to movement, learn at proper rate, be semi-deep
train_model_pool(model, optimizer, cost, cifar_train, 5) # Training architecture for 5 epochs - Slight improvment by increasing latent space

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


cost:  tensor(2.3044, grad_fn=<NllLossBackward>)
cost:  tensor(2.3018, grad_fn=<NllLossBackward>)
cost:  tensor(2.2991, grad_fn=<NllLossBackward>)
cost:  tensor(2.2884, grad_fn=<NllLossBackward>)
cost:  tensor(2.2762, grad_fn=<NllLossBackward>)
cost:  tensor(2.3035, grad_fn=<NllLossBackward>)
cost:  tensor(2.2580, grad_fn=<NllLossBackward>)
cost:  tensor(2.2733, grad_fn=<NllLossBackward>)
cost:  tensor(2.2601, grad_fn=<NllLossBackward>)
cost:  tensor(2.2348, grad_fn=<NllLossBackward>)
cost:  tensor(2.2052, grad_fn=<NllLossBackward>)
cost:  tensor(2.1713, grad_fn=<NllLossBackward>)
cost:  tensor(2.1312, grad_fn=<NllLossBackward>)
cost:  tensor(2.1115, grad_fn=<NllLossBackward>)
cost:  tensor(2.1246, grad_fn=<NllLossBackward>)
cost:  tensor(2.1669, grad_fn=<NllLossBackward>)
cost:  tensor(2.4176, grad_fn=<NllLossBackward>)
cost:  tensor(2.0640, grad_fn=<NllLossBackward>)
cost:  tensor(2.2219, grad_fn=<NllLossBackward>)
cost:  tensor(2.2374, grad_fn=<NllLossBackward>)
cost:  tensor(2.1173

cost:  tensor(1.5767, grad_fn=<NllLossBackward>)
cost:  tensor(1.6137, grad_fn=<NllLossBackward>)
cost:  tensor(1.5826, grad_fn=<NllLossBackward>)
cost:  tensor(1.6311, grad_fn=<NllLossBackward>)
cost:  tensor(1.6458, grad_fn=<NllLossBackward>)
cost:  tensor(1.6605, grad_fn=<NllLossBackward>)
cost:  tensor(1.5853, grad_fn=<NllLossBackward>)
cost:  tensor(1.5641, grad_fn=<NllLossBackward>)
cost:  tensor(1.6554, grad_fn=<NllLossBackward>)
cost:  tensor(1.5308, grad_fn=<NllLossBackward>)
cost:  tensor(1.6439, grad_fn=<NllLossBackward>)
cost:  tensor(1.5244, grad_fn=<NllLossBackward>)
cost:  tensor(1.5829, grad_fn=<NllLossBackward>)
cost:  tensor(1.5494, grad_fn=<NllLossBackward>)
cost:  tensor(1.5187, grad_fn=<NllLossBackward>)
cost:  tensor(1.5198, grad_fn=<NllLossBackward>)
cost:  tensor(1.5412, grad_fn=<NllLossBackward>)
cost:  tensor(1.5475, grad_fn=<NllLossBackward>)
cost:  tensor(1.5207, grad_fn=<NllLossBackward>)
cost:  tensor(1.5423, grad_fn=<NllLossBackward>)
cost:  tensor(1.4785

cost:  tensor(1.4071, grad_fn=<NllLossBackward>)
cost:  tensor(1.5186, grad_fn=<NllLossBackward>)
cost:  tensor(1.5234, grad_fn=<NllLossBackward>)
cost:  tensor(1.4675, grad_fn=<NllLossBackward>)
cost:  tensor(1.4620, grad_fn=<NllLossBackward>)
cost:  tensor(1.5319, grad_fn=<NllLossBackward>)
cost:  tensor(1.4300, grad_fn=<NllLossBackward>)
cost:  tensor(1.5020, grad_fn=<NllLossBackward>)
cost:  tensor(1.4234, grad_fn=<NllLossBackward>)
cost:  tensor(1.4238, grad_fn=<NllLossBackward>)
cost:  tensor(1.5726, grad_fn=<NllLossBackward>)
cost:  tensor(1.5074, grad_fn=<NllLossBackward>)
cost:  tensor(1.4532, grad_fn=<NllLossBackward>)
cost:  tensor(1.4098, grad_fn=<NllLossBackward>)
cost:  tensor(1.4357, grad_fn=<NllLossBackward>)
cost:  tensor(1.4348, grad_fn=<NllLossBackward>)
cost:  tensor(1.3852, grad_fn=<NllLossBackward>)
cost:  tensor(1.3721, grad_fn=<NllLossBackward>)
cost:  tensor(1.3715, grad_fn=<NllLossBackward>)
cost:  tensor(1.4236, grad_fn=<NllLossBackward>)
cost:  tensor(1.3833

Applying MaxPooling (computing defining features) and computing many latent features across the same localities helped the most! Pooling after every layer greatly helped improve stability.

For multiple channels, maintain separation of localized aggregations irrespective of position across channels

In [289]:
from torchvision.models import AlexNet

In [301]:
def train_model_pool(model: torch.nn.Module, optimizer, cost, train_loader, epochs: int = 5):
    for epoch in range(epochs):
        # Using training loader reorganizes with channel first
        for data, labels in train_loader: # Returns data, label tuple
            optimizer.zero_grad()
            
            # Obtaining cross entropy cost
            loss = cost(model(data), labels)
                
            # Resetting gradient
            # Computing gradients
            loss.backward()
            # Displaying cost every 10 iterations
            optimizer.step()
        
            # Printing end of batch
            print("cost: ", loss)

In [292]:
AlexNet()

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 