# ML with PyTorch - Transfer Learning

## Workflow
0. Load desired dataset
1. Load pre-trained model
2. Modify trained model's output layer
3. Train

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [2]:
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
# very useful, inline debugging
from IPython.core.debugger import set_trace

## helper functions from Scotte

In [3]:
def get_trainable(model_params):
    """Lazily yield the parameters whose ``requires_grad`` flag is truthy.

    Args:
        model_params: iterable of objects exposing ``requires_grad``
            (e.g. the result of ``model.parameters()``).

    Returns:
        A generator over the matching parameters, in their original order.
    """
    return (param for param in model_params if param.requires_grad)

def get_frozen(model_params):
    """Lazily yield the parameters whose ``requires_grad`` flag is falsy.

    Args:
        model_params: iterable of objects exposing ``requires_grad``
            (e.g. the result of ``model.parameters()``).

    Returns:
        A generator over the frozen parameters, in their original order.
    """
    return (param for param in model_params if not param.requires_grad)

def all_trainable(model_params):
    """Tell whether every parameter in ``model_params`` requires gradients.

    Iteration stops at the first frozen parameter. An empty iterable
    yields ``True``.
    """
    for param in model_params:
        if not param.requires_grad:
            return False
    return True

def all_frozen(model_params):
    """Tell whether no parameter in ``model_params`` requires gradients.

    Iteration stops at the first trainable parameter. An empty iterable
    yields ``True``.
    """
    for param in model_params:
        if param.requires_grad:
            return False
    return True

def freeze_all(model_params):
    """Set ``requires_grad`` to ``False`` on every parameter, in place.

    Args:
        model_params: iterable of parameters; it is walked exactly once,
            so a generator passed in will be exhausted afterwards.

    Returns:
        None. The parameters themselves are modified.
    """
    for tensor in model_params:
        tensor.requires_grad = False

## compose transforms

In [4]:
from torchvision import transforms

# Side length, in pixels, of the square crops fed to the network.
_image_size = 224
# Per-channel normalisation statistics; these are the values torchvision
# documents for its ImageNet pre-trained models.
_mean = [0.485, 0.456, 0.406]
_std = [0.229, 0.224, 0.225]

# Training pipeline: resize, then random crop / flip / colour jitter as
# augmentation, then convert to a normalised tensor.
train_trans = transforms.Compose(
    [
        transforms.Resize(256),  # some images are pretty small
        transforms.RandomCrop(_image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
        transforms.ToTensor(),
        transforms.Normalize(mean=_mean, std=_std),
    ]
)

# Validation pipeline: same resize and normalisation, but a fixed centre
# crop and no random augmentation.
val_trans = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(_image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=_mean, std=_std),
    ]
)

## 0. Load desired dataset

In [5]:
from torchvision.datasets.folder import ImageFolder

In [11]:
from utils import DogsCatsDataset

In [22]:
# ImageFolder derives the class labels from the sub-directory names under
# each root; the train/validation splits get their respective transforms.
train_ds = ImageFolder(root="data/dogscats/training_set/", transform=train_trans)
val_ds = ImageFolder(root="data/dogscats/val_set/", transform=val_trans)

# Mini-batch size used by the data loaders, and number of target classes.
batch_size = 32
n_classes = 2

## Torch Dataloader

In [23]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch; 16 worker processes load
# samples in parallel.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=16)

# Validation batches keep the dataset order (no shuffling).
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=16)

## 1. Load pre-trained model

In [24]:
from torchvision import models

# Build ResNet-18 with pre-trained weights; on first use the checkpoint is
# downloaded into the local torch hub cache.
model = models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /home/fredc/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth
100.0%


In [25]:
# Display the module tree (layer names and hyper-parameters) of the network.
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [27]:
import torchsummary

# Per-layer output shapes and parameter counts for one 3x224x224 input.
# The model has not been moved off the CPU at this point, hence device="cpu".
torchsummary.summary(model, input_size=(3, 224, 224), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

## 2. Modify last layer of trained model
    - we do not want to re-train all the pre-trained layers -> freeze their parameters (no gradients are computed for them)

In [31]:
def get_model(model, device, n_classes=2):
    """Turn a pre-trained network into a frozen backbone with a new head.

    All existing parameters are frozen first, then ``model.fc`` is replaced
    by a freshly initialised ``nn.Linear``. The new layer is created after
    the freeze, so its parameters remain trainable.

    Args:
        model: network exposing ``parameters()``, ``to()`` and an ``fc``
            classification head (e.g. a torchvision ResNet).
        device: ``torch.device`` the model is moved to.
        n_classes: number of outputs of the new head.

    Returns:
        The same ``model`` object, modified in place and moved to ``device``.
    """
    freeze_all(model.parameters())
    # Size the new head from the layer it replaces rather than assuming a
    # 512-wide feature vector, so backbones with a different width work too.
    # 512 (the previously hard-coded value) is kept as a fallback for heads
    # that do not expose `in_features`.
    in_features = getattr(model.fc, "in_features", 512)
    model.fc = nn.Linear(in_features, n_classes)
    return model.to(device)
# get_model changes the network in place and returns it, so `modified_model`
# and `model` refer to the same object from here on.
# Pass the notebook-level `n_classes` explicitly so the head size follows
# that constant instead of silently relying on the function default.
modified_model = get_model(model, device, n_classes=n_classes)
modified_model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

## Loss function - CE

In [32]:
# Cross-entropy on the raw class scores from the `fc` head; with the default
# arguments the loss is averaged over the samples of a batch.
loss_fn = nn.CrossEntropyLoss()

## Optimizer - Adam
    - gradient-based optimization of stochastic objective functions

In [33]:
# Hand Adam only the parameters that still require gradients; after
# get_model() that is the newly created `fc` head.
optimizer = optim.Adam(
    params=get_trainable(modified_model.parameters()),
    lr=1e-3,
)

In [38]:
num_epochs = 10

for epoch in range(num_epochs):
    # ---- training phase ----
    # `modified_model` is used throughout: it is the object whose trainable
    # parameters were given to the optimizer (get_model returns the very
    # module it received, so this is the network also bound to `model`).
    # NOTE(review): train() also puts the frozen BatchNorm layers into
    # training mode, so their running statistics keep being updated even
    # though their weights are frozen. Confirm this is intended.
    modified_model.train()

    total_loss, n_correct, n_samples = 0.0, 0, 0
    for batch_i, (X, y) in enumerate(train_dl):
        X, y = X.to(device), y.to(device)

        # PyTorch accumulates gradients across backward passes, so clear
        # them before computing the gradients of this batch.
        optimizer.zero_grad()

        y_ = modified_model(X)
        loss = loss_fn(y_, y)
        loss.backward()
        optimizer.step()

        # statistics
        batch_loss = loss.item()  # read the scalar once and reuse it
        batch_n = X.shape[0]
        print(
            f"Epoch {epoch+1}/{num_epochs} |"
            f"  batch: {batch_i} |"
            f"  batch loss:   {batch_loss:0.3f}"
        )
        # predicted class = index of the largest score in each row
        y_label_ = y_.argmax(dim=1)
        n_correct += (y_label_ == y).sum().item()
        # the loss is a per-batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average (last batch may be smaller)
        total_loss += batch_loss * batch_n
        n_samples += batch_n

    # statistics
    print(
        f"Epoch {epoch+1}/{num_epochs} |"
        f"  train loss: {total_loss / n_samples:9.3f} |"
        f"  train acc:  {n_correct / n_samples * 100:9.3f}%"
    )

    # ---- validation phase ----
    modified_model.eval()

    total_loss, n_correct, n_samples = 0.0, 0, 0

    # no gradients are needed for evaluation
    with torch.no_grad():
        for X, y in val_dl:
            X, y = X.to(device), y.to(device)

            y_ = modified_model(X)

            # statistics
            batch_n = X.shape[0]
            y_label_ = y_.argmax(dim=1)
            n_correct += (y_label_ == y).sum().item()
            loss = loss_fn(y_, y)
            total_loss += loss.item() * batch_n
            n_samples += batch_n

    # statistics
    print(
        f"Epoch {epoch+1}/{num_epochs} |"
        f"  valid loss: {total_loss / n_samples:9.3f} |"
        f"  valid acc:  {n_correct / n_samples * 100:9.3f}%"
    )

Epoch 1/10 |  batch: 0 |  batch loss:   0.611
Epoch 1/10 |  batch: 1 |  batch loss:   0.504
Epoch 1/10 |  batch: 2 |  batch loss:   0.535
Epoch 1/10 |  batch: 3 |  batch loss:   0.439
Epoch 1/10 |  train loss:     0.545 |  train acc:     73.000%
Epoch 1/10 |  valid loss:     0.431 |  valid acc:     75.000%
Epoch 2/10 |  batch: 0 |  batch loss:   0.594
Epoch 2/10 |  batch: 1 |  batch loss:   0.465
Epoch 2/10 |  batch: 2 |  batch loss:   0.471
Epoch 2/10 |  batch: 3 |  batch loss:   0.393
Epoch 2/10 |  train loss:     0.506 |  train acc:     74.000%
Epoch 2/10 |  valid loss:     0.343 |  valid acc:     90.000%
Epoch 3/10 |  batch: 0 |  batch loss:   0.417
Epoch 3/10 |  batch: 1 |  batch loss:   0.390
Epoch 3/10 |  batch: 2 |  batch loss:   0.373
Epoch 3/10 |  batch: 3 |  batch loss:   0.371
Epoch 3/10 |  train loss:     0.393 |  train acc:     87.000%
Epoch 3/10 |  valid loss:     0.249 |  valid acc:    100.000%
Epoch 4/10 |  batch: 0 |  batch loss:   0.319
Epoch 4/10 |  batch: 1 |  batc