<a href="https://colab.research.google.com/github/colinZejda/Summer2023_UCI_ML_Research/blob/main/1_Writing_ResNet_from_Scratch_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We will create, train and evaluate a ResNet NN for CIFAR-10 image classification

https://blog.paperspace.com/writing-resnet-from-scratch-in-pytorch/



Drawbacks of VGG (visual geometry group), a standard deep CNN

*   Couldn't go very deep because it started to lose generalization capability (started overfitting).
*   As a neural network gets deeper, the gradients from the loss function start to shrink to zero (vanishing gradient), and the weights don't get updated.
*   ResNet solves this vanishing gradients problem by using skip connections.
*   With skip connections, the output changes from h(x) = f(wx+b) into h(x) = f(x) + x





CIFAR-10 dataset

*   A common choice for beginner computer vision datasets.
*   It has 60k 32x32 color images in 10 different classes (training: 50k, testing: 10k)

In [None]:
# import libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
from torchvision import transforms            # provides image transforms
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
# Pick the compute device once: the CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
# Loading the dataset
def data_loader(data_dir, batch_size, random_seed=42, valid_size=0.1, shuffle=True, test=False):
  """Build CIFAR-10 DataLoaders.

  Args:
    data_dir: directory the dataset is read from (and downloaded to if missing).
    batch_size: number of images per batch.
    random_seed: seed for the NumPy shuffle that decides which training
      images go to the validation split.
    valid_size: fraction (0..1) of the training set held out for validation.
    shuffle: when building train/valid loaders, whether to shuffle the indices
      before splitting; when ``test=True`` it is forwarded to the DataLoader.
    test: if True, return a single loader over the CIFAR-10 test split.

  Returns:
    The test DataLoader when ``test`` is True, otherwise the tuple
    ``(train_loader, valid_loader)``.

  Raises:
    ValueError: if ``valid_size`` is outside [0, 1].
  """
  if not 0 <= valid_size <= 1:
    raise ValueError("valid_size must be in the range [0, 1], got {}".format(valid_size))

  # per-channel normalization (presumably the CIFAR-10 channel mean/std) to
  # aid faster convergence and better accuracy
  normalize = transforms.Normalize(
              mean=[0.4914, 0.4822, 0.4465],
              std=[0.2023, 0.1994, 0.2010])

  # make the image data workable: resize to 224x224, convert to a tensor, normalize
  transform = transforms.Compose([
              transforms.Resize((224, 224)),
              transforms.ToTensor(),
              normalize])

  if test:             # evaluation on the held-out test split
    test_dataset = datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform)
    return torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=shuffle)

  # Train and validation images both come from the official training split
  # and use the same transform, so one dataset object is shared; the two
  # samplers below are what keep the subsets apart.
  train_dataset = datasets.CIFAR10(
      root=data_dir, train=True, download=True, transform=transform)

  num_train = len(train_dataset)
  indices = list(range(num_train))
  split = int(np.floor(valid_size * num_train))      # number of validation images

  if shuffle:
    np.random.seed(random_seed)
    np.random.shuffle(indices)

  # first `split` indices -> validation, the remainder -> training (disjoint)
  valid_idx, train_idx = indices[:split], indices[split:]
  train_sampler = SubsetRandomSampler(train_idx)
  valid_sampler = SubsetRandomSampler(valid_idx)

  train_loader = torch.utils.data.DataLoader(
      train_dataset, batch_size=batch_size, sampler=train_sampler)
  valid_loader = torch.utils.data.DataLoader(
      train_dataset, batch_size=batch_size, sampler=valid_sampler)

  return (train_loader, valid_loader)


In [None]:
# Build the three CIFAR10 loaders (train / validation / test) from one shared
# set of arguments so the directory and batch size cannot drift apart.
loader_kwargs = dict(data_dir='./data', batch_size=64)
train_loader, valid_loader = data_loader(**loader_kwargs)
test_loader = data_loader(test=True, **loader_kwargs)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


Layers in PyTorch

*   nn.Conv2d: convolutional layers that accept the # of input and output channels, as well as kernel size. It can also accept stride or padding if needed.
*   nn.BatchNorm2d: applies batch normalization to the output from the convolutional layer. Aka: re-centers and re-scales the data to make the training of ANNs faster and more stable.
*   nn.ReLU: a type of activation function applied to various outputs in the network. Max(0, x).
*   nn.MaxPool2d: applies max pooling to the output with kernel size given. Aka: selects the maximum element from the region of the feature map covered by the filter (square of kernel_size^2)
*   nn.Dropout: used to apply dropout to the output with a given probability. Aka: randomly sets input units to 0 (the practice of disregarding certain nodes in a layer at random during training), which helps prevent overfitting by adding noise.
*   nn.Linear: a fully connected layer. Benefit: it is "structure agnostic", meaning no special assumptions are made about the input, and it is most malleable-- this layer can "turn off" various values of the weights during the matrix multiplication
*   nn.Sequential: not a type of layer, but helps in combining different operations that are part of the same step

In [None]:
# RESIDUAL BLOCK, re-used throughout the network
# The skip connection itself is always present; what is optional is the
# `downsample` module applied to the skipped input so that its shape matches
# the output of the two convolutions.
class ResidualBlock(nn.Module):
  """Two 3x3 conv+BatchNorm layers with a skip connection.

  Computes ``ReLU(conv2(conv1(x)) + shortcut(x))`` where ``shortcut`` is the
  identity, or ``downsample`` when one is given.

  Args:
    in_channels: channels of the input feature map.
    out_channels: channels produced by both convolutions.
    stride: stride of the first convolution (the second always uses 1).
    downsample: optional module applied to the input on the skip path; needed
      whenever ``stride != 1`` or ``in_channels != out_channels`` so the
      addition is shape-compatible.
  """

  def __init__(self, in_channels, out_channels, stride=1, downsample=None):
    super().__init__()
    self.conv1 = nn.Sequential(
                    nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1),
                    nn.BatchNorm2d(out_channels),
                    nn.ReLU())
    # no ReLU here: the activation is applied after the residual is added
    self.conv2 = nn.Sequential(
                    nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
                    nn.BatchNorm2d(out_channels))
    self.downsample = downsample
    self.relu = nn.ReLU()
    self.out_channels = out_channels

  def forward(self, x):
    """Return the block output for input feature map ``x``."""
    # Compare against None explicitly: truth-testing a module goes through
    # __len__ (e.g. an empty nn.Sequential is falsy), which could silently
    # skip a shortcut module that was supplied.
    residual = x if self.downsample is None else self.downsample(x)
    out = self.conv1(x)
    out = self.conv2(out)
    out += residual
    out = self.relu(out)
    return out

In [None]:
# BUILD RESNET
class ResNet(nn.Module):
  """ResNet-style image classifier assembled from a residual block type.

  Pipeline: 7x7 stride-2 conv stem -> 3x3 stride-2 max-pool -> four stages of
  residual blocks (64, 128, 256, 512 channels) -> global average pool ->
  linear classifier.

  Args:
    block: callable that builds one residual block. It is invoked as
      ``block(in_channels, out_channels, stride, downsample)`` for the first
      block of a stage and ``block(in_channels, out_channels)`` for the rest.
      The classifier expects the last stage to output 512 channels.
    layers: sequence of four ints, the number of blocks in each stage.
    num_classes: number of output logits.
  """

  def __init__(self, block, layers, num_classes = 10):
    super().__init__()
    self.inplanes = 64          # channels entering the next stage; updated by _make_layer
    self.conv1 = nn.Sequential(
                    nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
                    nn.BatchNorm2d(64),
                    nn.ReLU())
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer0 = self._make_layer(block, 64, layers[0], stride=1)
    self.layer1 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer2 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer3 = self._make_layer(block, 512, layers[3], stride=2)
    # Global average pooling. For 224x224 inputs the final feature map is 7x7,
    # so this matches a fixed 7x7 average pool there, while also accepting
    # other input resolutions instead of failing in the classifier.
    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    self.fc = nn.Linear(512, num_classes)

  def _make_layer(self, block, planes, blocks, stride=1):
    """Build one stage: ``blocks`` residual blocks with ``planes`` output channels.

    Only the first block of the stage may change the resolution (``stride``)
    or the channel count; the remaining blocks keep the shape unchanged.
    """
    downsample = None
    # The skip path adds the block input to the block output, so both must
    # have the same shape. If the first block halves the resolution
    # (stride != 1) or changes the channel count, project the input with a
    # strided 1x1 convolution so the addition is valid.
    if stride != 1 or self.inplanes != planes:
      downsample = nn.Sequential(
                      nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                      nn.BatchNorm2d(planes))

    layers = [block(self.inplanes, planes, stride, downsample)]
    self.inplanes = planes
    layers.extend(block(self.inplanes, planes) for _ in range(1, blocks))

    return nn.Sequential(*layers)

  def forward(self, x):
    """Return class logits of shape (batch, num_classes) for an image batch ``x``."""
    x = self.conv1(x)
    x = self.maxpool(x)
    x = self.layer0(x)
    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)

    x = self.avgpool(x)
    x = torch.flatten(x, 1)     # (batch, 512, 1, 1) -> (batch, 512)
    x = self.fc(x)

    return x

In [None]:
# Setting Hyperparameters
num_classes = 10          # CIFAR-10 has 10 classes
num_epochs = 20
batch_size = 16           # NOTE: the data loaders above were built with batch_size=64, not this value
learning_rate = 0.01


# instantiate model
# [3, 4, 6, 3] = number of residual blocks in each of the four stages
# (the ResNet-34 layout). num_classes is passed explicitly so that changing
# the hyperparameter above actually changes the size of the output layer.
model = ResNet(ResidualBlock, [3, 4, 6, 3], num_classes=num_classes).to(device)


# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9)

In [None]:
# TRAINING
import gc
total_step = len(train_loader)            # number of batches per epoch

for epoch in range(num_epochs):
  # Put BatchNorm (and any dropout) into training mode explicitly, so that
  # re-running this cell after an evaluation pass still trains correctly.
  model.train()

  for images, labels in train_loader:

    # move tensors to configured device
    images = images.to(device)
    labels = labels.to(device)

    # forward pass
    outputs = model(images)                # aka y_hat
    loss = criterion(outputs, labels)      # calc loss between y_hat and labels

    # backward and optimize (backprop)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # reported loss is that of the last batch of the epoch, not an epoch average
  print("Epoch[{} / {}], Loss: {:.4f}".format(epoch+1, num_epochs, loss.item()))

  # Clean up once per epoch. Doing this after every batch forces a full
  # garbage-collection pass and a CUDA cache flush on each step, which slows
  # training considerably; the per-batch tensors are released anyway when the
  # names are rebound on the next iteration.
  del images, labels, outputs
  gc.collect()            # gc = garbage collector
  torch.cuda.empty_cache()

In [None]:
# VALIDATION (accuracy of the current model on the validation loader)
was_training = model.training
model.eval()                # BatchNorm must use running statistics, not per-batch ones
with torch.no_grad():       # don't compute gradient (only forward passes thru model)
  correct = 0
  total = 0
  for images, labels in valid_loader:
    images = images.to(device)
    labels = labels.to(device)
    outputs = model(images)
    predicted = outputs.argmax(dim=1)      # index of the largest logit = predicted class
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
    del images, labels, outputs
  # report the number of images actually evaluated rather than a hard-coded count
  print("Accuracy of the network on the {} validation images: {} %".format(total, 100 * correct / total))
model.train(was_training)   # restore the previous mode so later training is unaffected


In [None]:
# TESTING (accuracy of the current model on the test loader)
was_training = model.training
model.eval()                # BatchNorm must use running statistics, not per-batch ones
with torch.no_grad():       # inference only, no gradients needed
  correct = 0
  total = 0
  for images, labels in test_loader:
    images = images.to(device)
    labels = labels.to(device)             # move the labels (not the images) to the device
    outputs = model(images)
    predicted = outputs.argmax(dim=1)      # index of the largest logit = predicted class
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
    del images, labels, outputs
  # report the number of images actually evaluated rather than a hard-coded count
  print("Accuracy of the network on the {} test images: {} %".format(total, 100 * correct / total))
model.train(was_training)   # restore the previous mode so later training is unaffected