A simple code that changes the number of different attention heads and the number of transformer encoders to test the performance of the image model.

Code reference with the help of ChatGPT.

In [31]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [40]:
import torch
import torchvision
from torchvision import transforms
from transformers import ViTModel
import torch.nn as nn
from torch.utils.data import Subset

# load pretrained VIT model
model = ViTModel.from_pretrained('google/vit-base-patch16-224', num_attention_heads=8)

# set mode to eval
model.eval()

# load CIFAR-10 dataser and subsample
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
fullset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
subset_indices = list(range(0, len(fullset), 100))
subset = Subset(fullset, subset_indices)
testloader = torch.utils.data.DataLoader(subset, batch_size=16, shuffle=False, num_workers=2)

# add a linear layer to classify
classifier = nn.Linear(model.config.hidden_size, 10)
model.classifier = classifier

# use the model to do classification to the data
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = model(images)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        logits = model.classifier(pooled_output)
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the ViT model on the CIFAR-10 test images: %d %%' % (100 * correct / total))



Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Files already downloaded and verified
Accuracy of the ViT model on the CIFAR-10 test images: 14 %


In [42]:
import torch
import torchvision
from torchvision import transforms
from transformers import ViTModel
import torch.nn as nn
from torch.utils.data import Subset

# load pretrained VIT model
model = ViTModel.from_pretrained('google/vit-base-patch16-224', num_attention_heads=4)

# set mode to eval
model.eval()

# load CIFAR-10 dataser and subsample
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
fullset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
subset_indices = list(range(0, len(fullset), 100))
subset = Subset(fullset, subset_indices)
testloader = torch.utils.data.DataLoader(subset, batch_size=16, shuffle=False, num_workers=2)

# add a linear layer to classify
classifier = nn.Linear(model.config.hidden_size, 10)
model.classifier = classifier

# use the model to do classification to the data
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = model(images)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        logits = model.classifier(pooled_output)
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the ViT model on the CIFAR-10 test images: %d %%' % (100 * correct / total))

Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Files already downloaded and verified
Accuracy of the ViT model on the CIFAR-10 test images: 12 %
