In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm
import random
import os
import csv
import time
import itertools
import matplotlib.pyplot as plt

In [None]:
# Show the GPU(s) visible to this runtime (driver/CUDA version, memory, utilisation).
!nvidia-smi

Sat Dec  7 18:35:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              48W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from stock PyTorch encoder layers.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches by a strided convolution, a learnable class token is prepended,
    learnable position embeddings are added, and the sequence is passed through
    ``depth`` ``nn.TransformerEncoderLayer`` blocks. The class-token output is
    normalised and projected to ``num_classes`` logits.

    Args:
        img_size: Side length of the input image; the patch count is
            ``(img_size // patch_size) ** 2``.
        patch_size: Side length of each patch; must divide ``img_size``.
        in_channels: Number of channels of the input image.
        num_classes: Number of output logits.
        dim: Token embedding width (``d_model`` of every encoder layer).
        depth: Number of encoder layers.
        heads: Number of attention heads per encoder layer.
        mlp_dim: Hidden width of each encoder layer's feed-forward network.
        dropout: Dropout probability passed to every encoder layer.

    Raises:
        AssertionError: If ``img_size`` is not divisible by ``patch_size``.
    """

    def __init__(self, img_size, patch_size, in_channels, num_classes, dim, depth, heads, mlp_dim, dropout=0.1):
        super().__init__()

        # The patch grid has to tile the image exactly.
        assert img_size % patch_size == 0, "Image size must be divisible by patch size"
        patches_per_side = img_size // patch_size
        num_patches = patches_per_side * patches_per_side

        # A conv whose kernel and stride both equal the patch size embeds each patch once.
        self.patch_embedding = nn.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size)

        # Learnable class token plus one position embedding per token (patches + class token).
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))

        encoder_layers = []
        for _ in range(depth):
            encoder_layers.append(
                nn.TransformerEncoderLayer(
                    d_model=dim,
                    nhead=heads,
                    dim_feedforward=mlp_dim,
                    dropout=dropout,
                    batch_first=True,
                )
            )
        self.transformer = nn.ModuleList(encoder_layers)

        # Classification head applied to the class-token representation.
        self.mlp_head = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_classes))

    def forward(self, x):
        """Return class logits of shape ``(batch_size, num_classes)`` for a batch of images ``x``."""
        # (batch, dim, grid_h, grid_w) -> (batch, num_patches, dim)
        tokens = self.patch_embedding(x).flatten(2).transpose(1, 2)

        # Prepend one copy of the class token to every sequence in the batch.
        cls_tokens = self.cls_token.expand(x.size(0), -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)

        tokens += self.pos_embedding

        for encoder_layer in self.transformer:
            tokens = encoder_layer(tokens)

        # Only the class token (sequence position 0) feeds the classifier.
        return self.mlp_head(tokens[:, 0])

In [None]:
def train_model(model, trainloader, criterion, optimizer, scheduler, device, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs and return the elapsed wall-clock time.

    Args:
        model: Module to train; it is moved to ``device`` in place.
        trainloader: Iterable yielding ``(inputs, labels)`` batches. It does not
            need to implement ``__len__``; batches are counted while iterating.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``trainloader``.

    Returns:
        Seconds elapsed for the whole training run, as a float.
    """
    model.to(device)
    start_time = time.time()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        loop = tqdm(trainloader, desc=f'Epoch [{epoch+1}/{num_epochs}]', leave=False)

        for inputs, labels in loop:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar once: each .item() call copies the value to the host.
            batch_loss = loss.item()
            running_loss += batch_loss
            num_batches += 1
            loop.set_postfix(loss=batch_loss)

        scheduler.step()

        # Average over the batches actually seen, so an empty loader or one
        # without __len__ does not crash the run.
        epoch_loss = running_loss / num_batches if num_batches else float('nan')
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    training_time = time.time() - start_time
    print(f"Training Time: {training_time:.2f} seconds")
    return training_time

def evaluate_model(model, testloader, device):
    """Measure top-1 accuracy of ``model`` on ``testloader``.

    Args:
        model: Module to evaluate; it is moved to ``device`` and switched to
            eval mode (it is left in eval mode on return).
        testloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the model and every batch are moved to.

    Returns:
        ``(accuracy, inference_time)``: accuracy as a percentage in ``[0, 100]``
        and the elapsed wall-clock seconds. Accuracy is ``0.0`` when the loader
        yields no samples.
    """
    # Match train_model: make sure the weights live on the same device as the batches.
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    start_time = time.time()

    with torch.no_grad():
        loop = tqdm(testloader, desc="Evaluating", leave=False)
        for inputs, labels in loop:
            inputs, labels = inputs.to(device), labels.to(device)
            # Predicted class is the index of the largest logit.
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    inference_time = time.time() - start_time
    # Guard against an empty loader instead of dividing by zero.
    accuracy = 100 * correct / total if total > 0 else 0.0
    print(f"Accuracy: {accuracy:.2f}%, Inference Time: {inference_time:.2f} seconds")
    return accuracy, inference_time

def hyperparameter_tuning(parameters_to_test, trainloader, valloader, num_epochs, num_trials=20):
    """Random-search ViT hyperparameters and return the best configuration found.

    Each sampled configuration is trained from scratch for ``num_epochs`` epochs
    on ``trainloader`` and scored on ``valloader``. One row per trial is appended
    to ``vit_tuning_results.csv`` in the current working directory.

    Args:
        parameters_to_test: Dict with the keys ``'patch_sizes'``, ``'dims'``,
            ``'depths'``, ``'heads'``, ``'mlp_dims'``, ``'dropouts'`` and
            ``'learning_rates'``, each mapping to an iterable of candidates.
        trainloader: Loader of ``(inputs, labels)`` training batches.
        valloader: Loader of ``(inputs, labels)`` batches used for model selection.
        num_epochs: Epochs to train each candidate.
        num_trials: Maximum number of configurations to try (default 20). It is
            capped at the number of valid combinations in the grid.

    Returns:
        Dict with keys ``'patch_size'``, ``'dim'``, ``'depth'``, ``'heads'``,
        ``'mlp_dim'``, ``'dropout'`` and ``'lr'`` for the trial with the highest
        validation accuracy, or an empty dict if no trial was run.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    img_size = 32  # input resolution the candidate models are built for

    # Start below any reachable accuracy so the first finished trial is always recorded.
    best_accuracy = float('-inf')
    best_params = {}

    search_space = itertools.product(
        parameters_to_test['patch_sizes'],
        parameters_to_test['dims'],
        parameters_to_test['depths'],
        parameters_to_test['heads'],
        parameters_to_test['mlp_dims'],
        parameters_to_test['dropouts'],
        parameters_to_test['learning_rates'])

    # Drop configurations the model cannot be built with, so one bad combination
    # cannot abort the whole sweep: the patch size must divide the image size and
    # the head count must divide the embedding width.
    valid_combinations = [
        combo for combo in search_space
        if img_size % combo[0] == 0 and combo[1] % combo[3] == 0
    ]

    # random.sample raises ValueError if asked for more items than are available.
    combinations = random.sample(valid_combinations, min(num_trials, len(valid_combinations)))

    csv_file = 'vit_tuning_results.csv'
    # Treat an existing but empty file like a missing one so it still gets a header.
    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
    with open(csv_file, mode='a', newline='') as csvfile:
        fieldnames = [
            'Patch Size', 'Dim', 'Depth', 'Heads', 'MLP Dim', 'Dropout',
            'Learning Rate', 'Training Time (s)', 'Inference Time (s)', 'Validation Accuracy (%)'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        for patch_size, dim, depth, heads, mlp_dim, dropout, lr in combinations:
            print(f"Testing combination: Patch Size={patch_size}, Dim={dim}, Depth={depth}, Heads={heads}, MLP Dim={mlp_dim}, Dropout={dropout}, LR={lr}")

            model = VisionTransformer(
                img_size=img_size,
                patch_size=patch_size,
                in_channels=3,
                num_classes=10,
                dim=dim,
                depth=depth,
                heads=heads,
                mlp_dim=mlp_dim,
                dropout=dropout
            ).to(device)

            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
            scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

            training_time = train_model(model, trainloader, criterion, optimizer, scheduler, device, num_epochs)
            accuracy, inference_time = evaluate_model(model, valloader, device)  # Evaluate on validation set

            # Log the results
            writer.writerow({
                'Patch Size': patch_size,
                'Dim': dim,
                'Depth': depth,
                'Heads': heads,
                'MLP Dim': mlp_dim,
                'Dropout': dropout,
                'Learning Rate': lr,
                'Training Time (s)': training_time,
                'Inference Time (s)': inference_time,
                'Validation Accuracy (%)': accuracy
            })
            # Persist each trial immediately so an interrupted sweep keeps its finished rows.
            csvfile.flush()

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {
                    'patch_size': patch_size,
                    'dim': dim,
                    'depth': depth,
                    'heads': heads,
                    'mlp_dim': mlp_dim,
                    'dropout': dropout,
                    'lr': lr
                }

    return best_params

In [None]:
# Normalization statistics are computed from the TRAINING split only, so no
# information from the test images leaks into preprocessing.
cifar10 = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())

# Accumulate per-channel sums in float64 over every pixel of every image. This
# yields the true dataset-wide standard deviation; averaging per-image standard
# deviations underestimates it.
channel_sum = torch.zeros(3, dtype=torch.float64)
channel_sq_sum = torch.zeros(3, dtype=torch.float64)
num_pixels = 0

for img, _ in cifar10:
    img = img.double()
    channel_sum += img.sum([1, 2])
    channel_sq_sum += (img ** 2).sum([1, 2])
    num_pixels += img.shape[1] * img.shape[2]

mean = channel_sum / num_pixels
# Var[x] = E[x^2] - E[x]^2, evaluated per channel.
std = (channel_sq_sum / num_pixels - mean ** 2).sqrt()

mean_tuple = tuple(mean.tolist())
std_tuple = tuple(std.tolist())

print(mean_tuple, std_tuple)

# Training transforms: random crop with padding and horizontal flip, then normalization
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean_tuple, std_tuple)
])

# Test transforms: normalization only, no augmentation
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean_tuple, std_tuple)
])

# Load training and test datasets
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

# Data loaders
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=2)

Files already downloaded and verified
(0.49421427, 0.4851322, 0.45040995) (0.20199372, 0.19911827, 0.20113052)
Files already downloaded and verified
Files already downloaded and verified


In [None]:
parameters_to_test = {
    'patch_sizes': [4, 8],
    'dims': [128, 256],
    'depths': [6, 8],
    'heads': [4, 8],
    'mlp_dims': [512, 1024],
    'dropouts': [0.1, 0.3],
    'learning_rates': [1e-3, 5e-4]
}

# Seed the RNGs so the sampled configurations and the data split are repeatable.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Hold out part of the TRAINING set for model selection. Tuning against the test
# set would make the final test accuracy an optimistic estimate.
VAL_SIZE = 5000
split_generator = torch.Generator().manual_seed(SEED)
shuffled_indices = torch.randperm(len(trainset), generator=split_generator).tolist()
val_indices, tune_train_indices = shuffled_indices[:VAL_SIZE], shuffled_indices[VAL_SIZE:]

tune_trainset = torch.utils.data.Subset(trainset, tune_train_indices)
# Same images as the training split, but with the evaluation transform (no augmentation).
tune_valset = torch.utils.data.Subset(
    torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_test),
    val_indices
)

tune_trainloader = torch.utils.data.DataLoader(tune_trainset, batch_size=128, shuffle=True, num_workers=2)
tune_valloader = torch.utils.data.DataLoader(tune_valset, batch_size=128, shuffle=False, num_workers=2)

best_params = hyperparameter_tuning(parameters_to_test, tune_trainloader, tune_valloader, num_epochs=10)
print(f"Best Parameters: {best_params}")

Testing combination: Patch Size=8, Dim=256, Depth=6, Heads=4, MLP Dim=1024, Dropout=0.3, LR=0.001
Epoch [1/10], Loss: 2.0907
Epoch [2/10], Loss: 1.9433
Epoch [3/10], Loss: 1.8691
Epoch [4/10], Loss: 1.8081
Epoch [5/10], Loss: 1.7564
Epoch [6/10], Loss: 1.6190
Epoch [7/10], Loss: 1.5536
Epoch [8/10], Loss: 1.5191
Epoch [9/10], Loss: 1.4825
Epoch [10/10], Loss: 1.4553
Training Time: 113.26 seconds
Accuracy: 53.89%, Inference Time: 1.45 seconds
Testing combination: Patch Size=4, Dim=256, Depth=8, Heads=8, MLP Dim=512, Dropout=0.3, LR=0.001
Epoch [1/10], Loss: 2.0837
Epoch [2/10], Loss: 1.8654
Epoch [3/10], Loss: 1.7694
Epoch [4/10], Loss: 1.6396
Epoch [5/10], Loss: 1.5471
Epoch [6/10], Loss: 1.4047
Epoch [7/10], Loss: 1.3486
Epoch [8/10], Loss: 1.2872
Epoch [9/10], Loss: 1.2510
Epoch [10/10], Loss: 1.2113
Training Time: 444.40 seconds
Accuracy: 63.26%, Inference Time: 3.04 seconds
Testing combination: Patch Size=4, Dim=256, Depth=8, Heads=4, MLP Dim=1024, Dropout=0.3, LR=0.001
Epoch [1/10

In [None]:
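# Optional shortcut: uncomment the lines below to skip the tuning cell and
# train the final model with this fixed configuration instead.
# best_params = {}
# best_params['patch_size'] = 4
# best_params['dim'] = 256
# best_params['depth'] = 8
# best_params['heads'] = 8
# best_params['mlp_dim'] = 512
# best_params['dropout'] = 0.1
# best_params['lr'] = 0.0005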

In [None]:
# Train best model
def final_model_evaluation(best_params, trainloader, testloader, num_epochs):
    """Train a fresh ViT with ``best_params`` and report its test-set metrics.

    Args:
        best_params: Dict providing ``'patch_size'``, ``'dim'``, ``'depth'``,
            ``'heads'``, ``'mlp_dim'``, ``'dropout'`` and ``'lr'``.
        trainloader: Loader of ``(inputs, labels)`` training batches.
        testloader: Loader of ``(inputs, labels)`` batches used for the final score.
        num_epochs: Number of training epochs.

    Returns:
        Dict with ``'training_time'`` (seconds), ``'test_accuracy'`` (percent)
        and ``'test_inference_time'`` (seconds).
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Build the network from the selected hyperparameters.
    vit_config = dict(
        img_size=32,
        patch_size=best_params['patch_size'],
        in_channels=3,
        num_classes=10,
        dim=best_params['dim'],
        depth=best_params['depth'],
        heads=best_params['heads'],
        mlp_dim=best_params['mlp_dim'],
        dropout=best_params['dropout'],
    )
    final_model = VisionTransformer(**vit_config).to(device)

    # Same loss / optimizer / schedule recipe as the tuning runs.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(final_model.parameters(), lr=best_params['lr'], weight_decay=1e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    # Time the complete training call from the outside.
    tic = time.time()
    train_model(final_model, trainloader, criterion, optimizer, scheduler, device, num_epochs)
    training_time = time.time() - tic

    # Score the trained model on the held-out loader.
    test_accuracy, test_inference_time = evaluate_model(final_model, testloader, device)

    print("Final Model Evaluation:")
    print(f"Training Time: {training_time:.2f} seconds")
    print(f"Test Accuracy: {test_accuracy:.2f}%")
    print(f"Test Inference Time: {test_inference_time:.2f} seconds")

    metrics = {
        'training_time': training_time,
        'test_accuracy': test_accuracy,
        'test_inference_time': test_inference_time,
    }
    return metrics

# Train the selected configuration on `trainloader` and report metrics on `testloader`.
# NOTE(review): num_epochs=31 is not a multiple of the StepLR step_size (5) used inside
# final_model_evaluation, so only the last epoch runs at the final learning rate — confirm this is intended.
final_metrics = final_model_evaluation(best_params, trainloader, testloader, num_epochs=31)
print(final_metrics)

Epoch [1/31], Loss: 1.8439
Epoch [2/31], Loss: 1.5073
Epoch [3/31], Loss: 1.3594
Epoch [4/31], Loss: 1.2673
Epoch [5/31], Loss: 1.1837
Epoch [6/31], Loss: 1.0479
Epoch [7/31], Loss: 0.9964
Epoch [8/31], Loss: 0.9600
Epoch [9/31], Loss: 0.9200
Epoch [10/31], Loss: 0.8838
Epoch [11/31], Loss: 0.8165
Epoch [12/31], Loss: 0.7810
Epoch [13/31], Loss: 0.7585
Epoch [14/31], Loss: 0.7372
Epoch [15/31], Loss: 0.7188
Epoch [16/31], Loss: 0.6744
Epoch [17/31], Loss: 0.6539
Epoch [18/31], Loss: 0.6449
Epoch [19/31], Loss: 0.6300
Epoch [20/31], Loss: 0.6202
Epoch [21/31], Loss: 0.5953
Epoch [22/31], Loss: 0.5855
Epoch [23/31], Loss: 0.5813
Epoch [24/31], Loss: 0.5730
Epoch [25/31], Loss: 0.5607
Epoch [26/31], Loss: 0.5511
Epoch [27/31], Loss: 0.5481
Epoch [28/31], Loss: 0.5413
Epoch [29/31], Loss: 0.5388
Epoch [30/31], Loss: 0.5383
Epoch [31/31], Loss: 0.5306
Training Time: 1724.29 seconds
Inference Time: 5.04 seconds

Final Model Evaluation:
Training Time: 1724.29 seconds
Test Accuracy: 84.79%
Tes