Refs:

https://github.com/deep-learning-with-pytorch/dlwpt-code

In [1]:
import numpy as np
import torch
from torchvision import datasets, transforms
import torch.optim as optim
import torch.nn as nn

torch.set_printoptions(edgeitems=2)
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
## seed PyTorch's global random number generator so that randomly initialized
## values (e.g. layer weights created below) are the same on every run
torch.manual_seed(1)

<torch._C.Generator at 0x7ff9e8cd13b0>

### Data

In [3]:
## CIFAR-10 training split; each image is converted to a tensor and then
## normalized per channel with the fixed mean / std values below.
## download=False: nothing is fetched here, the data is read from data_path.
data_path = '../../data/'

cifar10 = datasets.CIFAR10(
    root=data_path,
    train=True,
    download=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.4915, 0.4823, 0.4468),
                             std=(0.2470, 0.2435, 0.2616)),
    ]),
)

In [4]:
## CIFAR-10 validation split (train=False), preprocessed with the same
## ToTensor + Normalize pipeline and the same mean / std as the training split
cifar10_val = datasets.CIFAR10(
    root=data_path,
    train=False,
    download=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.4915, 0.4823, 0.4468),
                             std=(0.2470, 0.2435, 0.2616)),
    ]),
)

In [5]:
## class names as exposed by the dataset object's `classes` attribute
class_names = cifar10.classes
class_names

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [6]:
## build two-class subsets that keep only airplane (original label 0) and
## bird (original label 2) samples, remapping their labels to 0 and 1
class_names = ['airplane', 'bird']
label_map = {0: 0, 2: 1}
cifar2 = [(image, label_map[target]) for image, target in cifar10 if target in label_map]
cifar2_val = [(image, label_map[target]) for image, target in cifar10_val if target in label_map]

### Test: exploring `nn.Conv2d` and `nn.MaxPool2d`

In [7]:
## a simple model with 3 layers
import torch.nn as nn

In [8]:
## a single convolution layer: 3 input channels (the RGB channels) are mapped to
## 16 output channels (an arbitrary choice) using a 3x3 kernel
conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
conv

Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))

In [9]:
## shapes of the layer's weight and bias tensors:
## weight is (out_channels, in_channels, kernel_h, kernel_w), bias is (out_channels,)
conv.weight.shape, conv.bias.shape

(torch.Size([16, 3, 3, 3]), torch.Size([16]))

In [10]:
## run the first image of the two-class set through the convolution
img, _ = cifar2[0]
print(img.shape)

## add a leading dimension of size 1 so the single image forms a batch
output = conv(torch.unsqueeze(img, 0))
print(torch.unsqueeze(img, 0).shape)
print(output.shape)

torch.Size([3, 32, 32])
torch.Size([1, 3, 32, 32])
torch.Size([1, 16, 30, 30])


In [11]:
## padding=1 adds a one-pixel border around the input, so the 3x3 convolution
## produces an output with the same height and width as the input
conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
output = conv(torch.unsqueeze(img, 0))
print(torch.unsqueeze(img, 0).shape)
print(output.shape)

torch.Size([1, 3, 32, 32])
torch.Size([1, 16, 32, 32])


In [12]:
## detecting features
## Zero the bias and set every kernel weight to the same constant 1/9 (9 = 3*3), so that
## each 3x3 neighborhood of an input channel contributes the mean of its pixels to the output.

with torch.no_grad():   ## in-place parameter edits, not recorded by autograd
    conv.weight.fill_(1.0/9.0)
    conv.bias.zero_()

#### From large to small: Downsampling

In [13]:
## max pooling with a 2x2 window: every output pixel is the maximum of four
## input pixels, which halves both the height and the width
pool = nn.MaxPool2d(kernel_size=2)
output = pool(torch.unsqueeze(img, 0))

torch.unsqueeze(img, 0).shape, output.shape

(torch.Size([1, 3, 32, 32]), torch.Size([1, 3, 16, 16]))

### CNN Model

In [14]:
## Small CNN for a two-class problem on 3x32x32 inputs.
## nn.Flatten() is required between the last pooling layer and the first linear layer:
## the pooling output has shape (N, 8, 8, 8) while nn.Linear(8 * 8 * 8, 32) expects inputs
## whose last dimension is 512, so without it the forward pass fails with a shape mismatch.
model = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),  ## 3 channels --> 16 independent features
            nn.Tanh(),                                   ## Tanh activation function
            nn.MaxPool2d(2),                             ## 32x32 --> 16x16
            nn.Conv2d(16, 8, kernel_size=3, padding=1),  ## 8-channel 16x16
            nn.Tanh(),
            nn.MaxPool2d(2),                             ## 8 channel 16x16 --> 8 channel 8x8
            nn.Flatten(),                                ## (N, 8, 8, 8) --> (N, 512); adds no parameters
            nn.Linear(8 * 8 * 8, 32),                    ## 512 flattened values --> 32 features
            nn.Tanh(),
            nn.Linear(32, 2))                            ## 32 features --> 2 class scores

In [15]:
## element count of every parameter tensor in the model, shown with the overall total
numel_list = [param.numel() for param in model.parameters()]
sum(numel_list), numel_list

(18090, [432, 16, 1152, 8, 16384, 32, 64, 2])