# Problem 5 - Vision Transformers (20 points)

In [1]:
import os
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import timm

# Load data
def load_data_transformer(data_dir):
    # Load a pre-trained Swin Transformer
    model_name = "swin_tiny_patch4_window7_224"
    transformer = timm.create_model(model_name, pretrained=True)
    # Remove the final classification head
    transformer = torch.nn.Sequential(*(list(transformer.children())[:-1]))

    # Define image transformations
    data_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    features = []
    labels = []
    label_to_int = {}
    current_label_int = 0

    for filename in os.listdir(data_dir):
        if filename.endswith('.jpg'):
            image_path = os.path.join(data_dir, filename)
            image = Image.open(image_path).convert('RGB')
            image = data_transforms(image)
            image = image.unsqueeze(0)  # Add a batch dimension

            with torch.no_grad():
                feature = transformer(image).numpy().flatten()  # Extract features

            features.append(feature)
            label = filename.split('.')[0].split('_')[0]

            if label not in label_to_int:
                label_to_int[label] = current_label_int
                current_label_int += 1

            labels.append(label_to_int[label])

    features = normalize(features, axis=1)  # Normalize the features
    return np.array(features), np.array(labels), label_to_int

# Load the dataset and extract features
data_dir = 'images/'
features, labels, label_to_int = load_data_transformer(data_dir)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert numpy arrays to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [6]:
# Define the PyTorch linear classifier
class LinearClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LinearClassifier, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.linear(x)

# Create and train the linear classifier
num_classes = len(np.unique(labels))
input_size = X_train.shape[1]
classifier = LinearClassifier(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

num_epochs = 100
batch_size = 32

for epoch in range(num_epochs):
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i + batch_size]
        y_batch = y_train[i:i + batch_size]

        optimizer.zero_grad()
        outputs = classifier(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")

classifier.eval()
with torch.no_grad():
    test_outputs = classifier(X_test)
    _, predicted = torch.max(test_outputs, 1)
    correct = (predicted == y_test).sum().item()

mean_per_class_accuracy = correct / y_test.size(0)
print(f"Mean-per-class accuracy: {mean_per_class_accuracy * 100:.2f}%")

Epoch [10/100], Loss: 0.08153478056192398
Epoch [20/100], Loss: 0.02065344713628292
Epoch [30/100], Loss: 0.007428711745887995
Epoch [40/100], Loss: 0.002986220410093665
Epoch [50/100], Loss: 0.0012519218726083636
Epoch [60/100], Loss: 0.0005350172286853194
Epoch [70/100], Loss: 0.00023170700296759605
Epoch [80/100], Loss: 0.00010199053213000298
Epoch [90/100], Loss: 4.6120385377435014e-05
Epoch [100/100], Loss: 2.1884019588469528e-05
Mean-per-class accuracy: 94.05%


The Swin Transformer is a hierarchical Vision Transformer that introduces several improvements over the original Vision Transformer (ViT) design. Key enhancements include shifted window-based self-attention and a local relative position bias. The selected model, swin_tiny_patch4_window7_224, is a small version of the Swin Transformer architecture, which offers reduced computational complexity and fewer parameters while maintaining competitive performance. Swin Transformers are generally expected to outperform CNNs like ResNet-50 on large-scale datasets, as they can better capture long-range dependencies and learn hierarchical representations in input images.

The Vision Transformer, specifically the Swin Transformer, achieved a higher mean-per-class accuracy of 94.05% compared to the CNN (ResNet-50) at 74.3%. This indicates that the Swin Transformer performs better in this image classification task on the Oxford Pet Dataset.

In [7]:
from collections import Counter

In [40]:
misclassified_indices_trans = np.where(predicted != y_test)[0]
misclassified_predicted_labels = predicted[misclassified_indices_trans]
misclassified_ground_truth_labels = y_test[misclassified_indices_trans]
int_to_label = {v: k for k, v in label_to_int.items()}

misclassified_ground_truth_labels_names = [int_to_label[label.item()] for label in misclassified_ground_truth_labels]
misclassified_predicted_labels_names = [int_to_label[label.item()] for label in misclassified_predicted_labels]

ground_truth_label_counts_trans = Counter(misclassified_ground_truth_labels_names)
predicted_label_counts_trans = Counter(misclassified_predicted_labels_names)
print(ground_truth_label_counts_trans)
len(ground_truth_label_counts_trans)

Counter({'Bengal': 10, 'British': 10, 'staffordshire': 10, 'Ragdoll': 9, 'american': 8, 'Russian': 8, 'Egyptian': 4, 'boxer': 4, 'miniature': 4, 'Abyssinian': 3, 'Maine': 3, 'Birman': 3, 'Bombay': 2, 'english': 2, 'Persian': 2, 'keeshond': 1, 'german': 1, 'Siamese': 1, 'saint': 1, 'samoyed': 1, 'chihuahua': 1})


21

In [17]:
import os
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from torchvision.models import resnet50, ResNet50_Weights, ResNet18_Weights, ResNet34_Weights

# Load data
def load_data_cnn(data_dir):
    # Load a pre-trained CNN model
    cnn = models.resnet50(weights=ResNet50_Weights.DEFAULT)
    # Remove the final softmax layer
    cnn = torch.nn.Sequential(*(list(cnn.children())[:-1]))

    # Define image transformations
    data_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    features = []
    labels = []
    label_to_int = {}
    current_label_int = 0

    for filename in os.listdir(data_dir):
        if filename.endswith('.jpg'):
            image_path = os.path.join(data_dir, filename)
            image = Image.open(image_path).convert('RGB')
            image = data_transforms(image)
            image = image.unsqueeze(0)  # Add a batch dimension

            with torch.no_grad():
                feature = cnn(image).numpy().flatten()  # Extract features

            features.append(feature)
            label = filename.split('.')[0].split('_')[0]

            if label not in label_to_int:
                label_to_int[label] = current_label_int
                current_label_int += 1

            labels.append(label_to_int[label])

    features = normalize(features, axis=1)  # Normalize the features
    return np.array(features), np.array(labels), label_to_int

# Load the dataset and extract features
data_dir = 'images/'
features, labels, label_to_int = load_data_cnn(data_dir)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert numpy arrays to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [18]:
# Define the PyTorch linear classifier
class LinearClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LinearClassifier, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.linear(x)

# Create and train the linear classifier
num_classes = len(np.unique(labels))
input_size = X_train.shape[1]
classifier = LinearClassifier(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

num_epochs = 100
batch_size = 32

for epoch in range(num_epochs):
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i + batch_size]
        y_batch = y_train[i:i + batch_size]

        optimizer.zero_grad()
        outputs = classifier(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluate the linear classifier on the test set
classifier.eval()
with torch.no_grad():
    outputs = classifier(X_test)
    _, y_pred = torch.max(outputs, 1)
    accuracy = (y_pred == y_test).sum().item() / len(y_test)

print(f"Mean-per-class accuracy: {accuracy * 100:.2f}%")

Epoch [10/100], Loss: 2.3351
Epoch [20/100], Loss: 1.6042
Epoch [30/100], Loss: 1.1577
Epoch [40/100], Loss: 0.8653
Epoch [50/100], Loss: 0.6621
Epoch [60/100], Loss: 0.5154
Epoch [70/100], Loss: 0.4066
Epoch [80/100], Loss: 0.3242
Epoch [90/100], Loss: 0.2610
Epoch [100/100], Loss: 0.2120
Mean-per-class accuracy: 74.49%


In [37]:
misclassified_indices_cnn = np.where(y_pred != y_test)[0]
misclassified_predicted_labels = y_pred[misclassified_indices_cnn]
misclassified_ground_truth_labels = y_test[misclassified_indices_cnn]
int_to_label = {v: k for k, v in label_to_int.items()}

misclassified_ground_truth_labels_names = [int_to_label[label.item()] for label in misclassified_ground_truth_labels]
misclassified_predicted_labels_names = [int_to_label[label.item()] for label in misclassified_predicted_labels]

ground_truth_label_counts_cnn = Counter(misclassified_ground_truth_labels_names)
predicted_label_counts_cnn = Counter(misclassified_predicted_labels_names)
print(ground_truth_label_counts_cnn)
print(ground_truth_label_counts_trans)

Counter({'american': 33, 'staffordshire': 33, 'Bengal': 24, 'Russian': 22, 'Abyssinian': 20, 'Ragdoll': 19, 'Bombay': 17, 'boxer': 17, 'english': 13, 'Egyptian': 12, 'leonberger': 12, 'Persian': 11, 'Maine': 11, 'Siamese': 11, 'British': 11, 'Birman': 10, 'shiba': 10, 'chihuahua': 8, 'havanese': 8, 'newfoundland': 8, 'miniature': 8, 'wheaten': 7, 'scottish': 6, 'great': 6, 'yorkshire': 6, 'beagle': 6, 'basset': 5, 'german': 4, 'japanese': 4, 'saint': 3, 'keeshond': 3, 'pomeranian': 3, 'samoyed': 2, 'Sphynx': 2, 'pug': 2})
Counter({'Bengal': 10, 'British': 10, 'staffordshire': 10, 'Ragdoll': 9, 'american': 8, 'Russian': 8, 'Egyptian': 4, 'boxer': 4, 'miniature': 4, 'Abyssinian': 3, 'Maine': 3, 'Birman': 3, 'Bombay': 2, 'english': 2, 'Persian': 2, 'keeshond': 1, 'german': 1, 'Siamese': 1, 'saint': 1, 'samoyed': 1, 'chihuahua': 1})


Overall, the transformer classifies all categories better compared to the CNN. The categories where the models compare similarly are "British", "keeshond", and "samoyed" where the number of misclassified images are similar.

The code cell below computes the indices where transformer wrong/cnn right, transformer right/cnn wrong, transformer wrong/cnn wrong. 

In [50]:
dupes = set()
for trans in misclassified_indices_trans:
    for cnn in misclassified_indices_cnn:
        if trans == cnn:
            dupes.add(trans)

trans_wrong = []
for trans in misclassified_indices_trans:
    if trans not in dupes:
        trans_wrong.append(trans)

cnn_wrong = []
for cnn in misclassified_indices_cnn:
    if cnn not in dupes:
        cnn_wrong.append(cnn)

In [62]:
trans_wrong_labels = predicted[trans_wrong]
trans_wrong_names = [int_to_label[label.item()] for label in trans_wrong_labels]
print(Counter(trans_wrong_names))
print(len(trans_wrong_names))

Counter({'Russian': 7, 'British': 3, 'Abyssinian': 2, 'pomeranian': 2, 'Birman': 2, 'Ragdoll': 2, 'staffordshire': 2, 'american': 2, 'Bengal': 2, 'boxer': 1, 'Bombay': 1, 'Persian': 1, 'scottish': 1, 'Siamese': 1, 'Egyptian': 1, 'chihuahua': 1})
31


In [63]:
cnn_wrong_labels = y_pred[cnn_wrong]
cnn_wrong_names = [int_to_label[label.item()] for label in cnn_wrong_labels]
print(Counter(cnn_wrong_names))
print(len(cnn_wrong_names))

Counter({'British': 25, 'Bengal': 23, 'american': 19, 'newfoundland': 17, 'english': 16, 'boxer': 14, 'Russian': 14, 'Maine': 14, 'Birman': 13, 'Siamese': 12, 'miniature': 10, 'Persian': 10, 'samoyed': 9, 'german': 9, 'beagle': 8, 'chihuahua': 8, 'wheaten': 8, 'Sphynx': 8, 'Ragdoll': 8, 'keeshond': 8, 'pomeranian': 7, 'staffordshire': 7, 'havanese': 7, 'Egyptian': 6, 'Abyssinian': 5, 'leonberger': 5, 'Bombay': 5, 'pug': 5, 'saint': 4, 'yorkshire': 4, 'basset': 4, 'great': 3, 'scottish': 3, 'shiba': 2})
320


In [67]:
dupes_list = list(dupes)
both_wrong = predicted[dupes_list]
both_wrong_names = [int_to_label[label.item()] for label in both_wrong]
print(Counter(both_wrong_names))
print(len(both_wrong_names))

Counter({'american': 14, 'Birman': 6, 'chihuahua': 4, 'British': 4, 'Bengal': 3, 'Ragdoll': 3, 'Maine': 3, 'Abyssinian': 3, 'Egyptian': 3, 'miniature': 2, 'Siamese': 2, 'german': 2, 'staffordshire': 2, 'great': 1, 'Bombay': 1, 'saint': 1, 'boxer': 1, 'Russian': 1, 'Sphynx': 1})
57


As seen in the results above, there are 31 images where the transformer gets it wrong and the cnn gets it right, out of the 31 images, most categories only occur once or twice except for the 'Russian' category where the transformer misclassifies images in the category 7 times and the cnn gets it right.

There are 320 images where the cnn gets it wrong and the transformer gets it right. The categories that appear the most (>15 occurences) are 'British', 'Bengal', 'american', 'newfoundland', and 'english'. 

There are 57 images where both models get it wrong, the categories that appear the most are 'american' and 'Birman', the rest have less than 5 occurences.

# Bonus Problem 6 - BatchNorm - Investigations (10 bonus points)

$x_i = w^T h_i +b$

Batch Norm: $\^{x_i} = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2+\epsilon}}$

where $\mu_B = \frac{1}{m}\sum_{i=1}^m x_i$ and $\sigma_B^2 = \frac{1}{m}\sum_{i=1}^m(x_i - \mu _B)^2$

We can substitute:

$\^{x_i} = \frac{(w^T h_i + b) - \frac{1}{m}\sum_{i=1}^m (w^T h_i + b)}{\sqrt{\frac{1}{m}\sum_{i=1}^m(x_i - \mu_B)^2 + \epsilon}}$

$\^{x_i} = \frac{w^T h_i + (b - \frac{1}{m}\sum_{i=1}^m (w^T h_i + b))}{\sqrt{\frac{1}{m}\sum_{i=1}^m(x_i - \mu_B)^2 + \epsilon}}$

$\^{x_i} = \frac{w^T h_i}{\sqrt{\frac{1}{m}\sum_{i=1}^m(x_i - \mu _b)^2 + \epsilon}} +  \frac{(b - \frac{1}{m}\sum_{i=1}^m)}{\sqrt{\frac{1}{m}\sum_{i=1}^m(x_i - \mu _b)^2 + \epsilon}}$

$\^{x_i} = \frac{w^T h_i}{\sqrt{\frac{1}{m}\sum_{i=1}^m(x_i - \mu _b)^2 + \epsilon}} +  \frac{(b - \frac{1}{m}\sum_{i=1}^m)}{\sqrt{\frac{1}{m}\sum_{i=1}^m(x_i - \mu _b)^2 + \epsilon}}$

$\^{x_i} = \frac{w^T}{\sqrt{\sigma_B^2+\epsilon}}h_i + \frac{b - \mu_B}{\sqrt{\sigma_B^2+\epsilon}}$

$\^{x_i} = \frac{w^T}{\sqrt{\sigma_B^2+\epsilon}}h_i + (\frac{b}{\sqrt{\sigma_B^2+\epsilon}} - \frac{\mu_B}{\sqrt{\sigma_B^2+\epsilon}})$

The first term is the new weight and the second term (second + third term) is the new noise. Therefore we can see that using batch normalization we have a bias term $-\frac{\mu_B}{\sqrt{\sigma_B^2+\epsilon}}$ so don't need an additional bias term in the neural network architecture and we can see that the weight is scaled by $\frac{1}{\sqrt{\sigma_B^2+\epsilon}}$

# References

1. ChatGPT
2. https://stats.stackexchange.com/questions/304755/pros-and-cons-of-weight-normalization-vs-batch-normalization
3. https://medium.com/thecyphy/train-cnn-model-with-pytorch-21dafb918f48
4. https://huggingface.co/timm/swin_tiny_patch4_window7_224.ms_in22k
5. https://github.com/huggingface/pytorch-image-models
6. https://www.kaggle.com/datasets/nachiket273/visiontransformerpretrainedimagenet1kweights
7. https://pytorch.org/vision/stable/models/swin_transformer.html
8. https://github.com/berniwal/swin-transformer-pytorch
9. https://www.reddit.com/r/MachineLearning/comments/ti0u6i/d_complete_guide_of_swin_transformer_with_full/
10. https://www.kaggle.com/code/residentmario/batch-normalization-and-its-successors/notebook