# Homework 3

## 2D conv in matrix form

### 1

Given the 3x3 image

$$
\left[
\begin{matrix}
a_{11} & a_{12} & a_{13}  \\
a_{21} & a_{22} & a_{23}  \\
a_{31} & a_{32} & a_{33}  \\
\end{matrix}
\right]
$$

and a 2 × 2 convolutional kernel/filter

$$
\left[
\begin{matrix}
b_{11} & b_{12}  \\
b_{21} & b_{22}  \\
\end{matrix}
\right]
$$

The 2D convolution’s output is

$$
\left[
\begin{matrix}
a_{11}b_{11} + a_{12}b_{12} + a_{21}b_{21} + a_{22}b_{22}  & a_{12}b_{11} + a_{13}b_{12} + a_{22}b_{21} + a_{23}b_{22}  \\
a_{21}b_{11} + a_{22}b_{12} + a_{31}b_{21} + a_{32}b_{22}  & a_{22}b_{11} + a_{23}b_{12} + a_{32}b_{21} + a_{33}b_{22}  \\
\end{matrix}
\right]
$$

### 2

Express the 2D convolution as matrix-vector multiplication


$$
\left[
\begin{matrix}
a_{11} & a_{12} & a_{21} & a_{22}  \\
a_{12} & a_{13} & a_{22} & a_{23}  \\
a_{21} & a_{22} & a_{31} & a_{32}  \\
a_{22} & a_{23} & a_{32} & a_{33}  \\
\end{matrix}
\right]
\left[
\begin{matrix}
b_{11} \\
b_{12} \\
b_{21} \\
b_{22} \\
\end{matrix}
\right]
$$

### 3
The form of the matrix multiplied to the vector: each row is the patch flatten result

## Transfer learning

### 1. Train AlexNet from scratch

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchvision import datasets

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device:', device)

device: cuda:0


Download the dataset from Kaggle

In [3]:
!mv kaggle.json /root/.kaggle

In [None]:
# download dataset from kaggle
!kaggle datasets download -d gpiosenka/sports-classification

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 403, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [22]:
!cp kaggle.json /root/.kaggle

In [5]:
!unzip sports-classification.zip >> /dev/null

unzip:  cannot find or open sports-classification.zip, sports-classification.zip.zip or sports-classification.zip.ZIP.


In [None]:
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

In [None]:
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resize images to the desired size
    transforms.ToTensor(),           # Convert images to tensors
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalize images
])

In [None]:
# train data loader and test data loader
dataset = ImageFolder(root='./train', transform=transform)

batch_size = 128
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

test_dataset = ImageFolder(root='./test', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [None]:
def train_and_return_test_accuracy(network, epochs=30, lr=0.005):
    """Helper function, train the network on train set, and return test accuracy"""
    # train via storchastic gradient descent
    optimizer = optim.SGD(network.parameters(), lr=lr, momentum=0.9)
    loss_func = nn.CrossEntropyLoss()

    # train
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            output = network(data)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 10 == 0:
            print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch + 1, loss.item()))

    # compute test accuracy
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            outputs = network(data)
            _, predicted = torch.max(outputs.data, 1)
            total += 1
            correct += (predicted == target).sum().item()
    test_accuracy = correct / total
    return test_accuracy

### 1

In [None]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'alexnet', pretrained=False)
# modify the last layer to fit the number of classes 100
model.classifier[6] = nn.Linear(4096, 100)

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip


In [None]:
model = model.to(device)

In [None]:
test_accuracy = train_and_return_test_accuracy(model)
print(f'Test accuracy: {test_accuracy}')

Train Epoch: 10 	Loss: 2.316768
Train Epoch: 20 	Loss: 0.546149
Train Epoch: 30 	Loss: 0.345133
Test accuracy: 18.125


### 2. Transfer learning

Strategy: I have modified the last clasification layer to fit the number of classes 100, and freeze all the feature extraction layers, only train the classification part.

In [None]:
def train_and_return_test_accuracy(network, epochs=5, lr=0.005):
    """Helper function, train the network on train set, and return test accuracy"""
    # train via storchastic gradient descent
    optimizer = optim.SGD(network.parameters(), lr=lr, momentum=0.9)
    loss_func = nn.CrossEntropyLoss()

    # train
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            output = network(data)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 1 == 0:
            print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch + 1, loss.item()))

    # compute test accuracy
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            outputs = network(data)
            _, predicted = torch.max(outputs.data, 1)
            total += 1
            correct += (predicted == target).sum().item()
    test_accuracy = correct / total
    return test_accuracy

In [None]:
# transfer learning

model = torch.hub.load('pytorch/vision:v0.10.0', 'alexnet', pretrained=True)
# modify the last layer to fit the number of classes 100
model.classifier[6] = nn.Linear(4096, 100)

# freeze all feature layers
for param in model.features.parameters():
    param.requires_grad = False

model = model.to(device)

test_accuracy = train_and_return_test_accuracy(model)
print(f'Test accuracy: {test_accuracy}')

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Train Epoch: 1 	Loss: 1.176914
Train Epoch: 2 	Loss: 0.853603
Train Epoch: 3 	Loss: 0.233089
Train Epoch: 4 	Loss: 0.190487
Train Epoch: 5 	Loss: 0.087860
Train Epoch: 6 	Loss: 0.432858
Train Epoch: 7 	Loss: 0.057922
Train Epoch: 8 	Loss: 0.079179
Train Epoch: 9 	Loss: 0.049938
Train Epoch: 10 	Loss: 0.000854
Test accuracy: 26.1875


### 3. Transfer learning based on vgg19 and resnet50

1. For vgg19, I have replaced the last layer to fit the number of classes 100, and freeze all the feature extraction layers, only train the classification part.
2. For resnet50, replace the last layer to fit the number of classes 100, I have freeze all the previous layers except the last layer.

In [None]:
import torchvision.models as models

In [None]:
vgg19 = models.vgg19(pretrained=True)
vgg19

Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:06<00:00, 94.7MB/s]


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [None]:
# freeze all feature layers
# modify the last layer to fit the number of classes 100
vgg19.classifier[6] = nn.Linear(4096, 100)

# freeze all feature layers
for param in vgg19.features.parameters():
    param.requires_grad = False
vgg19

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [None]:
def train_and_return_test_accuracy(network, epochs=10, lr=0.0001):
    """Helper function, train the network on train set, and return test accuracy"""
    # train via storchastic gradient descent
    optimizer = optim.RMSprop(network.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()

    # train
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            output = network(data)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 1 == 0:
            print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch + 1, loss.item()))

    # compute test accuracy
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            outputs = network(data)
            _, predicted = torch.max(outputs.data, 1)
            total += 1
            correct += (predicted == target).sum().item()
    test_accuracy = correct / total
    return test_accuracy

In [None]:
vgg19 = vgg19.to(device)

test_accuracy = train_and_return_test_accuracy(vgg19)
print(f'Test accuracy: {test_accuracy}')

Train Epoch: 1 	Loss: 15.734919
Train Epoch: 2 	Loss: 4.625505
Train Epoch: 3 	Loss: 4.559018
Train Epoch: 4 	Loss: 4.346425
Train Epoch: 5 	Loss: 4.648879
Test accuracy: 0.6875


In [26]:
resnet50 = models.resnet50(pretrained=True)
resnet50

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 152MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [27]:
# freeze all feature layers
# modify the last layer to fit the number of classes 100
resnet50.fc = nn.Linear(2048, 100)

In [28]:
for param in resnet50.parameters():
    param.requires_grad = False
resnet50

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [37]:
resnet50.fc.requires_grad = True

In [38]:
def train_and_return_test_accuracy(network, epochs=5, lr=0.0001):
    """Helper function, train the network on train set, and return test accuracy"""
    # train via storchastic gradient descent
    optimizer = optim.RMSprop(network.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()

    # train
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            output = network(data)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 1 == 0:
            print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch + 1, loss.item()))

    # compute test accuracy
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            outputs = network(data)
            _, predicted = torch.max(outputs.data, 1)
            total += 1
            correct += (predicted == target).sum().item()
    test_accuracy = correct / total
    return test_accuracy

In [39]:
resnet50 = resnet50.to(device)

test_accuracy = train_and_return_test_accuracy(resnet50)
print(f'Test accuracy: {test_accuracy}')

Train Epoch: 1 	Loss: 2.616593
Train Epoch: 2 	Loss: 1.846568
Train Epoch: 3 	Loss: 1.248962
Train Epoch: 4 	Loss: 0.848985
Train Epoch: 5 	Loss: 0.826501
Test accuracy: 28.3125
