In [1]:
import torch
import torch.nn as nn

class MultiheadAttentionEinsum(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiheadAttentionEinsum, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.q_linear = nn.Linear(embedding_dim, embedding_dim)
        self.k_linear = nn.Linear(embedding_dim, embedding_dim)
        self.v_linear = nn.Linear(embedding_dim, embedding_dim)
        self.fc_out = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, query, key, value):
        batch_size = query.size(0)


        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)


        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)


        scores = torch.einsum("nqhd,nkhd->nhqk", Q, K) / (self.head_dim ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        out = torch.einsum("nhqk,nkhd->nqhd", attn, V).reshape(batch_size, -1, self.num_heads * self.head_dim)


        out = self.fc_out(out)
        return out

In [2]:
import torch
import torch.nn as nn

class TransformerEncoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(TransformerEncoderLayer, self).__init__()
        self.multihead_attention = MultiheadAttentionEinsum(embed_dim=embedding_dim, num_heads=num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embedding_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, embedding_dim)
        )
        self.layer_norm1 = nn.LayerNorm(embedding_dim)
        self.layer_norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        residual = x
        x = self.layer_norm1(x)
        x = x.permute(1, 0, 2)
        attn_output = self.multihead_attention(x, x, x)[0]
        x = attn_output + residual
        x = x.permute(1, 0, 2)

        residual = x
        x = self.layer_norm2(x)
        x = self.feed_forward(x)
        x = x + residual

        return x

In [3]:
class VisionTransformer(nn.Module):
    def __init__(self, num_classes, patch_size, embedding_dim, num_heads, num_layers):
        super(VisionTransformer, self).__init__()
        self.patch_embedding = nn.Conv2d(3, embedding_dim, kernel_size=patch_size, stride=patch_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, 14 * 14 + 1, embedding_dim))
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size = x.size(0)
        x = self.patch_embedding(x)
        x = x.flatten(2).transpose(1, 2)
        x = torch.cat((x, self.positional_encoding.repeat(batch_size, 1, 1)), dim=1)
        for layer in self.transformer_layers:
            x = layer(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        return x

In [4]:
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
num_epochs = 10
batch_size = 64
learning_rate = 0.001
num_classes = 10
patch_size = 16
embedding_dim = 128
num_heads = 8
num_layers = 3

# CIFAR-10 dataset preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)

# Data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:13<00:00, 12532544.57it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [5]:
import torch.optim as optim

# Initialize the model
model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
total_steps = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)


        outputs = model(images)
        loss = criterion(outputs, labels)


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_steps}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [100/782], Loss: 2.1960
Epoch [1/10], Step [200/782], Loss: 2.0369
Epoch [1/10], Step [300/782], Loss: 1.9783
Epoch [1/10], Step [400/782], Loss: 2.0499
Epoch [1/10], Step [500/782], Loss: 2.1027
Epoch [1/10], Step [600/782], Loss: 1.9756
Epoch [1/10], Step [700/782], Loss: 1.9870
Epoch [2/10], Step [100/782], Loss: 2.0334
Epoch [2/10], Step [200/782], Loss: 2.0514
Epoch [2/10], Step [300/782], Loss: 1.8165
Epoch [2/10], Step [400/782], Loss: 1.7043
Epoch [2/10], Step [500/782], Loss: 1.8695
Epoch [2/10], Step [600/782], Loss: 1.9248
Epoch [2/10], Step [700/782], Loss: 1.8271
Epoch [3/10], Step [100/782], Loss: 2.0409
Epoch [3/10], Step [200/782], Loss: 1.6809
Epoch [3/10], Step [300/782], Loss: 1.7645
Epoch [3/10], Step [400/782], Loss: 1.7958
Epoch [3/10], Step [500/782], Loss: 1.7031
Epoch [3/10], Step [600/782], Loss: 1.5427
Epoch [3/10], Step [700/782], Loss: 1.5633
Epoch [4/10], Step [100/782], Loss: 1.5802
Epoch [4/10], Step [200/782], Loss: 1.6690
Epoch [4/10

In [6]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy of the model on the {total} test images: {accuracy:.2f}%')

Test Accuracy of the model on the 10000 test images: 54.00%


In [7]:
#new hyperparameters for faster training
num_epochs = 5
batch_size = 64
learning_rate = 0.001


fast_model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)


optimizer = optim.Adam(fast_model.parameters(), lr=learning_rate)


total_steps = len(train_loader)
for epoch in range(num_epochs):
    fast_model.train()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)


        outputs = fast_model(images)
        loss = criterion(outputs, labels)


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_steps}], Loss: {loss.item():.4f}')


fast_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = fast_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy of the fast model on the {total} test images: {accuracy:.2f}%')

Epoch [1/5], Step [100/782], Loss: 2.0736
Epoch [1/5], Step [200/782], Loss: 2.0383
Epoch [1/5], Step [300/782], Loss: 1.9827
Epoch [1/5], Step [400/782], Loss: 1.9877
Epoch [1/5], Step [500/782], Loss: 2.0787
Epoch [1/5], Step [600/782], Loss: 1.8285
Epoch [1/5], Step [700/782], Loss: 1.9361
Epoch [2/5], Step [100/782], Loss: 1.9641
Epoch [2/5], Step [200/782], Loss: 1.7718
Epoch [2/5], Step [300/782], Loss: 1.8143
Epoch [2/5], Step [400/782], Loss: 1.7858
Epoch [2/5], Step [500/782], Loss: 1.6520
Epoch [2/5], Step [600/782], Loss: 1.6235
Epoch [2/5], Step [700/782], Loss: 1.5844
Epoch [3/5], Step [100/782], Loss: 1.6691
Epoch [3/5], Step [200/782], Loss: 1.7881
Epoch [3/5], Step [300/782], Loss: 1.6308
Epoch [3/5], Step [400/782], Loss: 1.6871
Epoch [3/5], Step [500/782], Loss: 1.5572
Epoch [3/5], Step [600/782], Loss: 1.5729
Epoch [3/5], Step [700/782], Loss: 1.5953
Epoch [4/5], Step [100/782], Loss: 1.6539
Epoch [4/5], Step [200/782], Loss: 1.4734
Epoch [4/5], Step [300/782], Loss:

it seems that these parameters don't have better results than before. so I will test new hyperparameters in the below

In [8]:
#new hyperparameters
num_epochs = 10
batch_size = 128
learning_rate = 0.0001
embedding_dim = 192
num_heads = 12
num_layers = 5

better_model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)

optimizer = optim.Adam(better_model.parameters(), lr=learning_rate)

total_steps = len(train_loader)
for epoch in range(num_epochs):
    better_model.train()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)


        outputs = better_model(images)
        loss = criterion(outputs, labels)


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_steps}], Loss: {loss.item():.4f}')

better_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = better_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy of the better model on the {total} test images: {accuracy:.2f}%')


Epoch [1/10], Step [100/782], Loss: 2.1389
Epoch [1/10], Step [200/782], Loss: 2.0677
Epoch [1/10], Step [300/782], Loss: 2.0865
Epoch [1/10], Step [400/782], Loss: 1.7972
Epoch [1/10], Step [500/782], Loss: 1.8876
Epoch [1/10], Step [600/782], Loss: 1.7640
Epoch [1/10], Step [700/782], Loss: 1.9183
Epoch [2/10], Step [100/782], Loss: 1.8415
Epoch [2/10], Step [200/782], Loss: 1.7543
Epoch [2/10], Step [300/782], Loss: 1.6258
Epoch [2/10], Step [400/782], Loss: 1.7255
Epoch [2/10], Step [500/782], Loss: 1.6265
Epoch [2/10], Step [600/782], Loss: 1.6652
Epoch [2/10], Step [700/782], Loss: 1.6803
Epoch [3/10], Step [100/782], Loss: 1.6722
Epoch [3/10], Step [200/782], Loss: 1.5482
Epoch [3/10], Step [300/782], Loss: 1.6027
Epoch [3/10], Step [400/782], Loss: 1.5421
Epoch [3/10], Step [500/782], Loss: 1.4406
Epoch [3/10], Step [600/782], Loss: 1.4171
Epoch [3/10], Step [700/782], Loss: 1.5027
Epoch [4/10], Step [100/782], Loss: 1.4334
Epoch [4/10], Step [200/782], Loss: 1.5702
Epoch [4/10

In [9]:
#vgg
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

#Hyperparameters
num_epochs = 10
batch_size = 64
learning_rate = 0.001
num_classes = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)


train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


vgg_model = models.vgg16(pretrained=True)


vgg_model.classifier[6] = nn.Linear(vgg_model.classifier[6].in_features, num_classes)


vgg_model = vgg_model.to(device)


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vgg_model.parameters(), lr=learning_rate)

Files already downloaded and verified
Files already downloaded and verified


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:07<00:00, 72.0MB/s]


In [10]:
#training loop
total_steps = len(train_loader)
for epoch in range(num_epochs):
    vgg_model.train()
    running_loss = 0.0

    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)


        outputs = vgg_model(images)
        loss = criterion(outputs, labels)


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{total_steps}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [100/782], Loss: 2.0554
Epoch [1/10], Step [200/782], Loss: 2.0801
Epoch [1/10], Step [300/782], Loss: 1.9316
Epoch [1/10], Step [400/782], Loss: 1.8572
Epoch [1/10], Step [500/782], Loss: 2.0636
Epoch [1/10], Step [600/782], Loss: 1.6279
Epoch [1/10], Step [700/782], Loss: 1.8516
Epoch [2/10], Step [100/782], Loss: 1.6406
Epoch [2/10], Step [200/782], Loss: 1.5471
Epoch [2/10], Step [300/782], Loss: 1.5558
Epoch [2/10], Step [400/782], Loss: 1.5739
Epoch [2/10], Step [500/782], Loss: 1.2151
Epoch [2/10], Step [600/782], Loss: 1.5280
Epoch [2/10], Step [700/782], Loss: 1.3711
Epoch [3/10], Step [100/782], Loss: 0.9659
Epoch [3/10], Step [200/782], Loss: 1.3221
Epoch [3/10], Step [300/782], Loss: 1.3090
Epoch [3/10], Step [400/782], Loss: 0.9485
Epoch [3/10], Step [500/782], Loss: 1.0455
Epoch [3/10], Step [600/782], Loss: 0.9109
Epoch [3/10], Step [700/782], Loss: 1.0114
Epoch [4/10], Step [100/782], Loss: 0.8991
Epoch [4/10], Step [200/782], Loss: 0.8269
Epoch [4/10

In [11]:
vgg_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = vgg_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy of the model on the {total} test images: {accuracy:.2f}%')

Test Accuracy of the model on the 10000 test images: 78.90%
