# Transfer Learning

```text
- Transfer learning is a machine learning technique where a model trained on a source task is reused as the starting point for a model on a related target task. 
- This can be done by freezing the weights of the pretrained model and then fine-tuning them on the target task.

Benefits
--------
- It can improve the performance of machine learning models on a variety of tasks.
- It can be especially useful when there is limited data available for the target task.
- It can save time and resources by reusing a pretrained model.

Challenges
----------
- The pretrained model may not be suitable for the target task.
- The pretrained model may need to be fine-tuned on the target task, which can be time-consuming.
- The pretrained model may not be available for the specific task at hand.
```

In [22]:
# Built-in library
import copy
import logging
from typing import Any, Optional, Sequence, Union

# Standard imports
import numpy as np
import numpy.typing as npt
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.nn.functional as F
from torchsummary import summary
from torch.utils.data import DataLoader, TensorDataset, random_split
from torchvision.datasets import FashionMNIST, MNIST, CIFAR10
from torchvision import datasets
import torchvision.transforms as T
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Configure the backend
import matplotlib_inline.backend_inline

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 2_000

matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
import seaborn as sns

# Custom import
from src.utilities import (
    set_up_logger,
    create_iris_data,
    create_qwerties_data,
    smooth,
)
from src.data_manager import (
    load_data,
    create_data_loader,
    split_into_train_n_validation,
)
from src.preprocessor import Standardizer, Normalizer


# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Transfer The Weights of MNSIT Digits Model To FMNIST Model

<br>

#### Load Data

In [2]:
# MNIST Data
fp = "../../data/mnist_digit"

# Chain multiple transformations
transform = T.Compose(
    [
        # Convert to PyTorch tensors
        T.ToTensor(),
        #  It performs per-channel normalization, where each channel
        # (e.g., red, green, blue for an RGB image) is normalized independently.
        # Since it's a single channel, we have (0.5,)
        T.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

train_data = MNIST(root=fp, train=True, transform=transform, download=True)
test_data = MNIST(root=fp, train=False, transform=transform, download=True)

train_size = int(0.8 * len(train_data))  # 80% of training data
val_size = len(train_data) - train_size  # 20% of training data
batch_size = 64

# Split the training dataset into training and validation sets
train_dataset, val_dataset = random_split(
    dataset=train_data,
    lengths=[train_size, val_size],
)

# Create DataLoader for each dataset
train_loader_mnist = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader_mnist = DataLoader(val_dataset, batch_size=batch_size)
test_loader_mnist = DataLoader(test_data, batch_size=batch_size)

In [3]:
# Fashion MNIST Data
fp = "../../data/fashion_mnist"

# Chain multiple transformations
transform = T.Compose(
    [
        # Convert to PyTorch tensors
        T.ToTensor(),
        #  It performs per-channel normalization, where each channel
        # (e.g., red, green, blue for an RGB image) is normalized independently.
        # Since it's a single channel, we have (0.5,)
        T.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

train_data = FashionMNIST(root=fp, train=True, transform=transform, download=True)
test_data = FashionMNIST(root=fp, train=False, transform=transform, download=True)

train_size = int(0.8 * len(train_data))  # 80% of training data
val_size = len(train_data) - train_size  # 20% of training data
batch_size = 64

# Split the training dataset into training and validation sets
train_dataset, val_dataset = random_split(
    dataset=train_data,
    lengths=[train_size, val_size],
)

# Create DataLoader for each dataset
train_loader_f_mnist = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader_f_mnist = DataLoader(val_dataset, batch_size=batch_size)
test_loader_f_mnist = DataLoader(test_data, batch_size=batch_size)

In [4]:
train_data.data.shape

torch.Size([60000, 28, 28])

In [5]:
# Set random seeds for reproducibility
RANDOM_STATE = 123

torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)


# Each MNSIT image is 1x28x28, so it is an 2D array [28,28]
# I'll flatten the image as vector dim=1*28*28
input_size = 1 * 28 * 28
num_classes = 10
num_epochs = 5
batch_size = 64
learning_rate = 0.001

<hr><br>

### Steps For Calculating The Output of The Layer 

#### Formula For The Size of Image At The Current Layer

$$ N = \lfloor{(\frac{M + 2p - k}{S})}\rfloor + 1$$

```text
N = Number of pixels in the current layer.
M = Number of pixels in the previous layer.
2p = Padding (it's multiplied by 2 because it's symmetric)
K = Number of pixels in the kernel/filter.
S = Stride parameter
⌊⌋ = floor (i.e 2.2 becomes 2.0)
```
<br>

```python
class CNNModel(nn.Module):
    def __init__(self, display_shape: bool = False) -> None:
        super().__init__()
        self.display_shape = display_shape
        # Conv layer
        self.conv_1 = nn.Conv2d(
            in_channels=1, out_channels=10, kernel_size=5, stride=1, padding=1
        )
        # Size of the output of the conv layer (equation is shown above)
        # Step 1
        # NB: 28 (this is the size of the image. i.e 28X28)
        # size_1 = np.floor((28 + (2*1) - 5) / 1 ) + 1 = 26/2 = 13 (/2 cox of maxpool)

        self.conv_2 = nn.Conv2d(
            in_channels=10, out_channels=20, kernel_size=5, stride=1, padding=1
        )
        # Step 2
        # NB: 13 (this is the output of conv_1. i.e size_1)
        # size_2 = np.floor((13 + (2 * 1) - 5) / 1) + 1 = 11/2 = 5 (/2 cox of maxpool)

        # ========== Fully Connected Layer ==========
        # Compute the number of units in the FCLayer (i.e. output of conv_2)
        # NB: This FCLayer has no padding or stride
        expected_size = 5 # (size_2)
        # Find the square since it's symmetrical
        expected_size = 20 * int(np.square(expected_size))
        self.fc_layer_3 = nn.Linear(expected_size, 50)

        # ========== Output Layer ==========
        self.output = nn.Linear(50, 10)

```

#### Steps

```text
Step 1: Calculate the output of conv_1
    - size_1 = np.floor((28 + (2*1) - 5) / 1 ) + 1 = 26/2 = 13 (/2 cox of maxpool)

Step 2: Calculate the output of conv_2
    - size_2 = np.floor((13 + (2 * 1) - 5) / 1) + 1 = 11/2 = 5 (/2 cox of maxpool)

Step 3: Calculate the input of the fully connected layer
    - input_size = size_2 * size_2
      self.fc_layer_3 = nn.Linear(input_size x n_channels, 50)

```

<br>



### Create MNIST Model

```text
- The model weights will be transferred to another model (FashionMNIST model)
```

In [6]:
def _calculate_size(
    image_input: int, padding: int, kernel_size: int, stride: int
) -> int:
    """This is used to calculate the output of the CNN."""
    output = np.floor(((image_input + 2 * padding - kernel_size) / stride)) + 1
    return np.floor(output / 2)


def calculate_layer_input(
    image_input: int, padding: int, kernel_size: int, stride: int
) -> int:
    """This is used to calculate the input of the fully connected layer in a CNN."""
    step_1 = _calculate_size(
        image_input=image_input, padding=padding, kernel_size=kernel_size, stride=stride
    )
    step_2 = _calculate_size(
        image_input=step_1, padding=padding, kernel_size=kernel_size, stride=stride
    )
    output = int(step_2)
    return output


class MNISTModel(nn.Module):
    """This is used to build a Convolutional Neutral Network architecture that
    is used for classification of the MNIST data.

    Params:
        input_size: This is the output of the final conv layer.
        num_classes (int): This is the number of class labels in the input data.
    """

    def __init__(self, input_size: int, num_classes: int) -> None:
        super().__init__()
        self.conv_1 = nn.Conv2d(
            in_channels=1, out_channels=10, kernel_size=5, stride=1, padding=1
        )
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv_2 = nn.Conv2d(
            in_channels=10, out_channels=20, kernel_size=5, stride=1, padding=1
        )
        self.fc_1 = nn.Linear((20 * input_size * input_size), 50)
        self.fc_2 = nn.Linear(50, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """This performs the forward propagation."""

        # Conv -> pool
        x = self.pool(F.relu(self.conv_1(x)))
        x = self.pool(F.relu(self.conv_2(x)))

        # Flatten the input images to have the dim:
        # (num_samples, n_channels, n_input, n_input)
        x = x.view(x.size(0), -1)
        x = self.fc_1(x)
        x = self.fc_2(x)

        return x

In [7]:
# Test the model with random data
# (1_000, 1, 28, 28): (n_samples, n_channels, image_size, image_size)
conv_output_size = calculate_layer_input(
    image_input=28, padding=1, kernel_size=5, stride=1
)
print(f"Output of Conv Layer: {conv_output_size}\n")
cnn_model = MNISTModel(input_size=conv_output_size, num_classes=num_classes)
X_ = torch.rand(size=(1_000, 1, 28, 28))

result = cnn_model.forward(x=X_)
result.shape

Output of Conv Layer: 5



torch.Size([1000, 10])

### Train The MNIST Model

In [8]:
def train(
    model: MNISTModel,
    device: Any,
    train_loader: DataLoader,
    val_loader: DataLoader,
    criterion: Any,
    optimizer: torch.optim,
    num_epochs: int,
) -> MNISTModel:
    """This is used for training the model."""
    for epoch in tqdm(range(num_epochs)):
        model.train()
        running_loss = 0.0

        # ==== Batch training loop ====
        for images, labels in train_loader:
            # Push the data to GPU if available
            images, labels = images.to(device), labels.to(device)

            # ==== Forwardprop ====
            outputs = model(images)
            loss: nn.CrossEntropyLoss = criterion(outputs, labels)

            # ==== Backprop ====
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Update the loss
            running_loss += loss.item()

        # ==== Validation loop ====
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                # Push the data to GPU if available
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_loss = criterion(outputs, labels).item()
                # It returns the value and the index.
                # We're interested in the index
                _, predicted = torch.max(outputs, dim=1)
                val_total += labels.size(0)  # or labels.shape[0]
                val_correct += (predicted == labels).sum().item()
        val_accuracy = (val_correct / val_total) * 100

        print(
            f"Epoch {epoch + 1}/{num_epochs}, "
            f"Training Loss: {running_loss / len(train_loader)}, "
            f"Validation Loss: {val_loss / len(val_loader)}, "
            f"Validation Accuracy: {val_accuracy:.2f}%"
        )
    return model


def test(model: MNISTModel, device: Any, test_loader: DataLoader):
    """This is used to the model on the test dataset."""
    model.eval()
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            # Push the data to GPU if available
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = torch.argmax(outputs, dim=1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    test_accuracy = (test_correct / test_total) * 100
    print(f"Test Accuracy: {test_accuracy:.2f}%")

In [9]:
def main():
    """This is the main function."""
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # If we're on a CUDA machine, this should print a CUDA device:
    print(f"Working on device={device!r}")

    # ==== Init model ====
    model = MNISTModel(
        input_size=conv_output_size,
        num_classes=num_classes,
    ).to(device=device)

    # ==== Define loss function and optimizer ====
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    # ==== Train the model ====
    model = train(
        model,
        device=device,
        train_loader=train_loader_mnist,
        val_loader=val_loader_mnist,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=num_epochs,
    )

    # ==== Evaluate the model ====
    test(model, device=device, test_loader=test_loader_mnist)
    return model

In [10]:
# Train and evaluate
mnist_model = main()

Working on device=device(type='cpu')


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5, Training Loss: 0.22236178043608865, Validation Loss: 0.0001227181464275147, Validation Accuracy: 97.52%
Epoch 2/5, Training Loss: 0.06364271127882724, Validation Loss: 5.3120033934395366e-05, Validation Accuracy: 98.12%
Epoch 3/5, Training Loss: 0.045613968019373716, Validation Loss: 2.933987862172913e-05, Validation Accuracy: 98.32%
Epoch 4/5, Training Loss: 0.03791267141296218, Validation Loss: 3.084124244273977e-05, Validation Accuracy: 98.81%
Epoch 5/5, Training Loss: 0.03114697021866838, Validation Loss: 9.347980673563608e-06, Validation Accuracy: 98.64%
Test Accuracy: 99.07%


### Test The Model

```text
- Test the MNIST model on the Fashion MNIST data.
```

In [11]:
# Extract X, y from FASHION test dataloader
X, y = next(iter(test_loader_f_mnist))
y_proba = mnist_model(X)
y_pred = torch.argmax(y_proba, axis=1)
fashion_acc = 100 * (torch.mean((y_pred == y).float()))

print(f"MNIST Digits model performance on FASHION data: {fashion_acc:.2f}%")

MNIST Digits model performance on FASHION data: 15.62%


In [12]:
mnist_model

MNISTModel(
  (conv_1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv_2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
  (fc_1): Linear(in_features=500, out_features=50, bias=True)
  (fc_2): Linear(in_features=50, out_features=10, bias=True)
)

### 1. Fine-tune The Model with One Training Batch

In [13]:
# Note:
# The first print function will show a non-zeros matrix because the weights of the two models differ.
# The second print function shows the zeros matrix because the two models have identical weights.

# Create the target model
fashion_net = MNISTModel(input_size=conv_output_size, num_classes=num_classes)
matrix = fashion_net.conv_1.weight[0] - mnist_model.conv_1.weight[0]
print(f"1st (Non-zero): \n{matrix}")

# Then replace all the weights in TARGET model from SOURCE model
for source, target in zip(
    mnist_model.named_parameters(), fashion_net.named_parameters()
):
    target[1].data = copy.deepcopy(source[1].data)

matrix = fashion_net.conv_1.weight[0] - mnist_model.conv_1.weight[0]

print("=" * 80)
print(f"\n2nd (Zero-matrix): \n{matrix}")

1st (Non-zero): 
tensor([[[-5.0515e-02, -3.0436e-01,  4.1554e-02, -2.9614e-01,  8.6978e-02],
         [-3.1389e-01, -2.1803e-01, -2.6253e-01,  3.0337e-01, -5.2898e-02],
         [ 1.9131e-01, -1.5062e-01,  4.3029e-03, -6.6835e-02,  1.5565e-01],
         [-7.6559e-02,  7.2208e-03, -1.4116e-01,  1.2844e-01, -5.6489e-03],
         [ 2.3686e-04, -2.1828e-01, -2.2357e-01,  1.6261e-01,  2.0629e-01]]],
       grad_fn=<SubBackward0>)

2nd (Zero-matrix): 
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]], grad_fn=<SubBackward0>)


In [14]:
summary(mnist_model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 10, 26, 26]             260
         MaxPool2d-2           [-1, 10, 13, 13]               0
            Conv2d-3           [-1, 20, 11, 11]           5,020
         MaxPool2d-4             [-1, 20, 5, 5]               0
            Linear-5                   [-1, 50]          25,050
            Linear-6                   [-1, 10]             510
Total params: 30,840
Trainable params: 30,840
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.09
Params size (MB): 0.12
Estimated Total Size (MB): 0.21
----------------------------------------------------------------


In [15]:
summary(fashion_net, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 10, 26, 26]             260
         MaxPool2d-2           [-1, 10, 13, 13]               0
            Conv2d-3           [-1, 20, 11, 11]           5,020
         MaxPool2d-4             [-1, 20, 5, 5]               0
            Linear-5                   [-1, 50]          25,050
            Linear-6                   [-1, 10]             510
Total params: 30,840
Trainable params: 30,840
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.09
Params size (MB): 0.12
Estimated Total Size (MB): 0.21
----------------------------------------------------------------


#### Re-train The Copied Model (fashion_net) 
text
```
- Re-train the copied model using the Fashion MNIST data.
```

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=fashion_net.parameters(), lr=learning_rate)

fashion_net = train(
    model=fashion_net,
    device=device,
    train_loader=train_loader_f_mnist,
    val_loader=val_loader_f_mnist,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5, Training Loss: 0.5470346232652664, Validation Loss: 0.0015083234360877503, Validation Accuracy: 86.42%
Epoch 2/5, Training Loss: 0.345637819459041, Validation Loss: 0.0010313773725895172, Validation Accuracy: 88.69%
Epoch 3/5, Training Loss: 0.3061181615094344, Validation Loss: 0.0011674218830910134, Validation Accuracy: 88.84%
Epoch 4/5, Training Loss: 0.2849910481572151, Validation Loss: 0.0007020186712133124, Validation Accuracy: 89.35%
Epoch 5/5, Training Loss: 0.2678710467616717, Validation Loss: 0.0008637752938777842, Validation Accuracy: 89.16%


In [17]:
# ==== Evaluate the model ====
test(fashion_net, device=device, test_loader=test_loader_f_mnist)

Test Accuracy: 88.45%


### 2. Train Only The Output Layer

In [18]:
# ==== Create the target model ====
fashion_net = MNISTModel(input_size=conv_output_size, num_classes=num_classes)

# ==== Replace all the weights in TARGET model from SOURCE model ====
for source, target in zip(
    mnist_model.named_parameters(), fashion_net.named_parameters()
):
    target[1].data = copy.deepcopy(source[1].data)

# Adjust the number of output units (Final layer)
fashion_net.fc_2 = nn.Linear(50, num_classes)

# ==== Freeze convolution and batch-norm layers ====
for _param in fashion_net.named_parameters():
    if ("conv" in _param[0]) or ("bnorm" in _param[0]):
        _param[1].requires_grad = False

In [19]:
# ==== Retrain the model ====

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=fashion_net.parameters(), lr=learning_rate)

fashion_net = train(
    model=fashion_net,
    device=device,
    train_loader=train_loader_f_mnist,
    val_loader=val_loader_f_mnist,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5, Training Loss: 0.5369026172955831, Validation Loss: 0.001318495403578941, Validation Accuracy: 85.13%
Epoch 2/5, Training Loss: 0.3918050659298897, Validation Loss: 0.0014470904431444533, Validation Accuracy: 86.53%
Epoch 3/5, Training Loss: 0.3650636683603128, Validation Loss: 0.0013575189291162694, Validation Accuracy: 87.18%
Epoch 4/5, Training Loss: 0.3521836158931255, Validation Loss: 0.0010353727068038697, Validation Accuracy: 86.97%
Epoch 5/5, Training Loss: 0.3446938945055008, Validation Loss: 0.000949744610710347, Validation Accuracy: 86.99%


In [20]:
# ==== Evaluate the model ====
test(fashion_net, device=device, test_loader=test_loader_f_mnist)

Test Accuracy: 85.73%


## Load State of The Art Models And Fine-tune The Model.

<br>

### 1. ResNet

In [25]:
import torchvision.models as models


# Fashion MNIST Data
fp = "../../data/stl_10"

# Chain multiple transformations
transform = T.Compose(
    [
        # Convert to PyTorch tensors
        T.ToTensor(),
        #  It performs per-channel normalization, where each channel
        # (e.g., red, green, blue for an RGB image) is normalized independently.
        # Since it's a single channel, we have (0.5,)
        T.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

train_data = datasets.STL10(root=fp, split="train", transform=transform, download=True)
test_data = datasets.STL10(root=fp, split="test", transform=transform, download=True)

train_size = int(0.8 * len(train_data))  # 80% of training data
val_size = len(train_data) - train_size  # 20% of training data
batch_size = 64

# Split the training dataset into training and validation sets
train_dataset, val_dataset = random_split(
    dataset=train_data,
    lengths=[train_size, val_size],
)

# Create DataLoader for each dataset
train_loader_stl_10 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader_stl_10 = DataLoader(val_dataset, batch_size=batch_size)
test_loader_stl_10 = DataLoader(test_data, batch_size=batch_size)

Downloading http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz to ../../data/stl_10/stl10_binary.tar.gz


100%|██████████| 2640397119/2640397119 [18:28<00:00, 2381542.09it/s]


Extracting ../../data/stl_10/stl10_binary.tar.gz to ../../data/stl_10
Files already downloaded and verified


In [41]:
train_data.data.shape

(5000, 3, 96, 96)

In [26]:
# Load and inspect this network
resnet_18 = models.resnet18(pretrained=True)
resnet_18

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/neidu/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:11<00:00, 4.00MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [40]:
summary(resnet_18.to(device), input_size=(3, 96, 96))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 48, 48]           9,408
       BatchNorm2d-2           [-1, 64, 48, 48]             128
              ReLU-3           [-1, 64, 48, 48]               0
         MaxPool2d-4           [-1, 64, 24, 24]               0
            Conv2d-5           [-1, 64, 24, 24]          36,864
       BatchNorm2d-6           [-1, 64, 24, 24]             128
              ReLU-7           [-1, 64, 24, 24]               0
            Conv2d-8           [-1, 64, 24, 24]          36,864
       BatchNorm2d-9           [-1, 64, 24, 24]             128
             ReLU-10           [-1, 64, 24, 24]               0
       BasicBlock-11           [-1, 64, 24, 24]               0
           Conv2d-12           [-1, 64, 24, 24]          36,864
      BatchNorm2d-13           [-1, 64, 24, 24]             128
             ReLU-14           [-1, 64,

In [44]:
# Define transfer learning class
class TransferLearningModel(nn.Module):
    """This is used to load the pre-trained ResNet model."""

    def __init__(self, num_classes: int) -> None:
        super().__init__()
        self.model = models.resnet18(pretrained=True)

        # Freeze all the layers in the pre-trained model
        for param in self.model.parameters():
            param.requires_grad = False

        # Modify the last fully connected layer for the new task (classification)
        in_features = self.model.fc.in_features
        self.model.fc = nn.Linear(in_features=in_features, out_features=num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """This performs the forward propagation."""
        return self.model(x)

In [48]:
# Test the model with random data
# (1_000, 1, 28, 28): (n_samples, n_channels, image_size, image_size)
conv_output_size = calculate_layer_input(
    image_input=28, padding=1, kernel_size=5, stride=1
)
print(f"Output of Conv Layer: {conv_output_size}\n")
transf_model = TransferLearningModel(num_classes=num_classes)
X_ = torch.rand(size=(1_000, 3, 96, 96))

result = transf_model.forward(x=X_)
result.shape

Output of Conv Layer: 5



torch.Size([1000, 10])

In [49]:
def train_transfer_learning(
    model: TransferLearningModel,
    device: Any,
    train_loader: DataLoader,
    val_loader: DataLoader,
    criterion: Any,
    optimizer: torch.optim,
    num_epochs: int,
) -> TransferLearningModel:
    """This is used for training the pre-trained model."""
    for epoch in tqdm(range(num_epochs)):
        model.train()
        running_loss = 0.0

        # ==== Batch training loop ====
        for images, labels in train_loader:
            # Push the data to GPU if available
            images, labels = images.to(device), labels.to(device)

            # ==== Forwardprop ====
            outputs = model(images)
            loss: nn.CrossEntropyLoss = criterion(outputs, labels)

            # ==== Backprop ====
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Update the loss
            running_loss += loss.item()

        # ==== Validation loop ====
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                # Push the data to GPU if available
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_loss = criterion(outputs, labels).item()
                # It returns the value and the index.
                # We're interested in the index
                _, predicted = torch.max(outputs, dim=1)
                val_total += labels.size(0)  # or labels.shape[0]
                val_correct += (predicted == labels).sum().item()
        val_accuracy = (val_correct / val_total) * 100

        print(
            f"Epoch {epoch + 1}/{num_epochs}, "
            f"Training Loss: {running_loss / len(train_loader)}, "
            f"Validation Loss: {val_loss / len(val_loader)}, "
            f"Validation Accuracy: {val_accuracy:.2f}%"
        )
    return model


def test(model: TransferLearningModel, device: Any, test_loader: DataLoader):
    """This is used to the model on the test dataset."""
    model.eval()
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            # Push the data to GPU if available
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = torch.argmax(outputs, dim=1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    test_accuracy = (test_correct / test_total) * 100
    print(f"Test Accuracy: {test_accuracy:.2f}%")

In [50]:
def main():
    """This is the main function."""
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # If we're on a CUDA machine, this should print a CUDA device:
    print(f"Working on device={device!r}")

    # ==== Init model ====
    model = TransferLearningModel(num_classes=num_classes).to(device=device)

    # ==== Define loss function and optimizer ====
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    # ==== Train the model ====
    model = train_transfer_learning(
        model,
        device=device,
        train_loader=train_loader_stl_10,
        val_loader=val_loader_stl_10,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=num_epochs,
    )

    # ==== Evaluate the model ====
    test(model, device=device, test_loader=test_loader_stl_10)
    return model

In [51]:
main()

Working on device=device(type='cpu')


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5, Training Loss: 1.416323531241644, Validation Loss: 0.06494010984897614, Validation Accuracy: 70.80%
Epoch 2/5, Training Loss: 0.8007853191996378, Validation Loss: 0.05132629722356796, Validation Accuracy: 75.10%
Epoch 3/5, Training Loss: 0.6694839894771576, Validation Loss: 0.04335065186023712, Validation Accuracy: 76.90%
Epoch 4/5, Training Loss: 0.5988558440927475, Validation Loss: 0.04236869513988495, Validation Accuracy: 76.70%
Epoch 5/5, Training Loss: 0.5514242029379285, Validation Loss: 0.040829259902238846, Validation Accuracy: 77.70%
Test Accuracy: 78.35%


TransferLearningModel(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=Tru

<hr><br>

### 2. VggNet (Visual Geometry Group Network)

```text
- The model size is ~600MB so I won't download the pre-trained model.
- I also will NOT fine-tune the model due to the model size.
- The code below is a code snippet on how to fine-tune the model.
```

<br>

```python
# ==== Note ====:
# The model size is ~600MB so I won't download the pre-trained model.
vggnet = models.vgg16(pretrained=True)
vggnet

# The last fully connected layer
vggnet.classifier[6].in_features

# Define transfer learning class
class TransferLearningModel(nn.Module):
    """This is used to load the pre-trained VggNet model."""

    def __init__(self, num_classes: int) -> None:
        super().__init__()
        self.model = models.vgg16(pretrained=True)

        # Freeze all the layers in the pre-trained model
        for param in self.model.parameters():
            param.requires_grad = False

        # Modify the last fully connected layer for the new task (classification)
        in_features = self.model.classifier[6].in_features
        self.model.classifier[6] = nn.Linear(
            in_features=in_features, out_features=num_classes
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """This performs the forward propagation."""
        return self.model(x)

def main():
    """This is the main function."""
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # If we're on a CUDA machine, this should print a CUDA device:
    print(f"Working on device={device!r}")

    # ==== Init model ====
    model = TransferLearningModel(num_classes=num_classes).to(device=device)

    # ==== Define loss function and optimizer ====
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    # ==== Train the model ====
    model = train_transfer_learning(
        model,
        device=device,
        train_loader=train_loader_stl_10,
        val_loader=val_loader_stl_10,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=num_epochs,
    )

    # ==== Evaluate the model ====
    test(model, device=device, test_loader=test_loader_stl_10)
    return model

    
# ==== Fine-tune the model ====
main()
```