### Importing Required Libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [None]:
# Pick the compute backend once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device  # echo the selected backend as the cell output

### Defining the seed

In [None]:
# Seed Python's generator and both PyTorch generators with the same value
# so that repeated runs draw the same random numbers.
random.seed(42)
torch.cuda.manual_seed(42)
torch.manual_seed(42)

### Defining the model and training parameters

In [None]:
# --- Input data ---
IMAGE_SIZE = 32       # side length passed to the patch embedding
CHANNELS = 3          # input channels of the patch-projection convolution
NUM_CLASSES = 10      # width of the classification head

# --- Model architecture ---
PATCH_SIZE = 4        # kernel size (and default stride) of the patch convolution
EMBED_DIM = 256       # token width used throughout the encoder
NUM_HEAD = 8          # attention heads per encoder layer (must divide EMBED_DIM)
DEPTH = 6             # number of stacked encoder layers
MLP_DIM = 512         # hidden width of each encoder layer's feed-forward block
DROP_RATE = 0.1       # dropout probability in attention and feed-forward blocks

# --- Optimisation ---
BATCH_SIZE = 128      # samples per DataLoader batch
LEARNING_RATE = 3e-4  # learning rate handed to the optimizers

### Initial Transformations applied to test and train data

In [None]:
# Baseline preprocessing (no augmentation): convert each image to a tensor,
# then normalise the three channels with fixed per-channel mean/std values.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                     std=[0.2023, 0.1994, 0.2010])
])

### Loading the dataset and defining the train and test loader

In [None]:
def load_data(transformation, test_transformation=None):
  """Create the CIFAR-10 train/test datasets and their data loaders.

  The data is stored under ``./data`` and downloaded when missing.

  Args:
    transformation: Transform applied to every training image.
    test_transformation: Transform applied to every test image. When left as
      ``None`` the training transform is reused, which is the original
      behaviour. Pass a deterministic transform here whenever
      ``transformation`` contains random augmentation, so that evaluation is
      not augmented as well.

  Returns:
    ``(train_dataset, test_dataset, train_loader, test_loader)``. Both loaders
    use ``BATCH_SIZE``; only the training loader shuffles.
  """
  if test_transformation is None:
    test_transformation = transformation

  train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transformation)
  test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transformation)
  train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
  return train_dataset, test_dataset, train_loader, test_loader

In [None]:
# Build the baseline datasets and loaders with the plain (non-augmented) transform.
train_dataset, test_dataset, train_loader, test_loader = load_data(transform)

### Defining PatchEmbedding module that splits an image into patches using a convolution layer, flattens them into embeddings, adds a learnable class token and positional encodings, and outputs the sequence for a Vision Transformer.

In [None]:
class PatchEmbedding(nn.Module):
    """Turn a batch of images into a token sequence for the transformer.

    A strided convolution cuts each image into patches and projects every patch
    to ``embed_dim`` channels. A learnable class token is prepended and a
    learnable positional embedding is added to the whole sequence.

    Args:
        img_size: Image side length used to size the positional embedding (the
            patch count below is computed for a square image).
        patch_size: Kernel size of the convolution, i.e. the patch side length.
        in_channels: Number of channels in the input images.
        embed_dim: Width of every output token.
        stride: Stride of the convolution. ``None`` means ``patch_size``
            (non-overlapping patches); a smaller value makes patches overlap.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim, stride=None):
        super().__init__()
        if stride is None:
            stride = patch_size
        self.patch_size = patch_size
        self.stride = stride

        # Patch extraction and linear projection fused into a single convolution.
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=stride)

        # Output length of an unpadded convolution along one axis; the 2-D grid
        # has that many patches per side.
        patches_per_side = (img_size - patch_size) // stride + 1
        sequence_length = patches_per_side * patches_per_side + 1  # +1 for the class token

        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.randn(1, sequence_length, embed_dim))

    def forward(self, x: torch.Tensor):
        """Return tokens of shape ``[B, 1 + num_patches, embed_dim]``."""
        feature_map = self.proj(x)                                  # [B, embed_dim, H', W']
        tokens = feature_map.flatten(start_dim=2).permute(0, 2, 1)  # [B, num_patches, embed_dim]
        cls = self.cls_token.expand(x.size(0), -1, -1)              # one class token per sample
        sequence = torch.cat((cls, tokens), dim=1)                  # [B, 1+num_patches, embed_dim]
        return sequence + self.pos_embed


In [None]:
class MLP(nn.Module):
  """Feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

  The input width is restored by the second linear layer, so the output has
  the same last dimension as the input.

  Args:
    in_features: Width of the input (and of the output).
    hidden_features: Width of the intermediate representation.
    drop_rate: Dropout probability; the same dropout module is applied after
      the activation and after the second linear layer.
  """

  def __init__(self, in_features, hidden_features, drop_rate):
    super().__init__()
    self.fc1 = nn.Linear(in_features, hidden_features)
    self.drop = nn.Dropout(drop_rate)
    self.fc2 = nn.Linear(hidden_features, in_features)

  def forward(self, x):
    hidden = F.gelu(self.fc1(x))
    hidden = self.drop(hidden)
    projected = self.fc2(hidden)
    return self.drop(projected)

### Implementing a Multi-Head Self-Attention layer, which projects input embeddings into queries, keys, and values, computes scaled dot-product attention across multiple heads, and combines the results into the output embedding.

In [None]:
class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention over several heads.

    One linear layer produces queries, keys and values together; attention is
    computed independently per head, the heads are concatenated, and a final
    linear layer mixes them.

    Args:
        embed_dim: Width of the input and output tokens.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super().__init__()
        # Each head receives an equal slice of the embedding.
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)  # fused Q/K/V projection
        self.proj = nn.Linear(embed_dim, embed_dim)     # output projection
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape ``(B, N, D)`` to an output of the same shape."""
        batch, seq_len, width = x.shape

        # (B, N, 3*D) -> (B, N, 3, heads, head_dim) -> (3, B, heads, N, head_dim)
        fused = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, self.head_dim)
        queries, keys, values = fused.permute(2, 0, 3, 1, 4).unbind(dim=0)

        # Similarity of every query with every key, scaled by sqrt(head_dim).
        logits = torch.matmul(queries, keys.transpose(-2, -1)) / (self.head_dim ** 0.5)  # (B, heads, N, N)
        weights = self.dropout(logits.softmax(dim=-1))

        mixed = torch.matmul(weights, values)  # (B, heads, N, head_dim)

        # Put the heads back side by side: (B, N, heads, head_dim) -> (B, N, D)
        merged = mixed.transpose(1, 2).reshape(batch, seq_len, width)
        return self.proj(merged)


### Defining Transformer Encoder Layer, which applies layer normalization, multi-head self-attention, and a feed-forward MLP with residual connections to process input embeddings.

In [None]:
class TransformerEncoderLayer(nn.Module):
  """One encoder block with two residual sub-layers.

  Each sub-layer normalises its input first (LayerNorm), then applies either
  self-attention or the feed-forward MLP, and adds the result back onto its
  input.

  Args:
    embed_dim: Token width.
    num_head: Number of attention heads.
    mlp_dim: Hidden width of the feed-forward MLP.
    drop_rate: Dropout probability shared by attention and MLP.
  """

  def __init__(self, embed_dim, num_head, mlp_dim,drop_rate):
    super().__init__()
    self.norm1 = nn.LayerNorm(embed_dim)
    self.attn = MultiHeadSelfAttention(embed_dim, num_head, drop_rate)
    self.norm2 = nn.LayerNorm(embed_dim)
    self.mlp = MLP(in_features=embed_dim,
                   hidden_features=mlp_dim,
                   drop_rate=drop_rate)

  def forward(self, x):
    # Residual connection around attention.
    attended = self.attn(self.norm1(x))
    x = x + attended
    # Residual connection around the feed-forward block.
    transformed = self.mlp(self.norm2(x))
    return x + transformed

### Defining Vision Transformer (ViT) model that converts an image into patch embeddings, processes them through stacked transformer encoder layers, normalizes the output, and uses the class token for final classification.

In [None]:
class VisionTransformer(nn.Module):
  """Image classifier: patch embedding -> encoder stack -> LayerNorm -> linear head.

  Only the class token (sequence position 0) is fed to the classification head.

  Args:
    img_size, patch_size, in_channels, embed_dim, stride: Forwarded to
      ``PatchEmbedding``.
    num_classes: Number of output logits.
    depth: Number of ``TransformerEncoderLayer`` blocks.
    num_heads, mlp_dim, drop_rate: Forwarded to every encoder block.
  """

  def __init__(self, img_size, patch_size, in_channels, num_classes, embed_dim, depth, num_heads, mlp_dim, drop_rate, stride = None):
    super().__init__()
    self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim, stride=stride)

    blocks = []
    for _ in range(depth):
      blocks.append(TransformerEncoderLayer(embed_dim, num_heads, mlp_dim, drop_rate))
    self.encoder = nn.Sequential(*blocks)

    self.norm = nn.LayerNorm(embed_dim)
    self.head = nn.Linear(embed_dim, num_classes)

  def forward(self,x):
    tokens = self.encoder(self.patch_embed(x))
    # Normalise, then keep only the class token of each sample.
    cls_repr = self.norm(tokens)[:, 0]
    return self.head(cls_repr)

### Instantiating our baseline model

In [None]:
# Baseline ViT built from the hyperparameters defined earlier; the stride is
# left at its default, so the patch size is used as the stride.
model = VisionTransformer(
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    in_channels=CHANNELS,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    depth=DEPTH,
    num_heads=NUM_HEAD,
    mlp_dim=MLP_DIM,
    drop_rate=DROP_RATE,
)
model = model.to(device)

In [None]:
# Echo the module tree as the cell output.
model

In [None]:
# Adam over all baseline-model parameters, trained against cross-entropy loss.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

### This function trains a PyTorch model for one epoch, updating weights, computing loss, and returning the average loss and accuracy.

In [None]:
def train(model, loader, optimizer, criterion, scheduler=None):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module to optimise; it is switched to training mode.
        loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, target)``. Each batch
            loss is weighted by the batch size, which yields a per-sample mean
            when the criterion returns a batch average (the default for
            ``nn.CrossEntropyLoss``).
        scheduler: Accepted for call compatibility; it is not used here.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the samples actually processed.
        Counting processed samples (instead of ``len(loader.dataset)``) keeps
        the averages correct when the loader drops the last batch or draws
        through a sampler. ``(0.0, 0.0)`` is returned for an empty loader.
    """
    model.train()  # Enable dropout and other training-time behaviour

    # Send batches to wherever the model's parameters live, so the function
    # does not rely on a module-level device variable matching the model.
    first_param = next(model.parameters(), None)

    total_loss = 0.0
    total_correct = 0
    seen = 0

    for data, target in loader:
        if first_param is not None:
            data, target = data.to(first_param.device), target.to(first_param.device)
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()

        batch_size = data.size(0)
        seen += batch_size
        total_loss += loss.item() * batch_size
        # detach() keeps the metric computation out of the autograd graph.
        total_correct += (out.detach().argmax(dim=1) == target).sum().item()

    if seen == 0:
        return 0.0, 0.0
    return total_loss / seen, total_correct / seen


### This function evaluates a PyTorch model on a dataset, returning its accuracy without updating the model’s weights.

In [None]:
def evaluate(model, loader):
    """Return the classification accuracy of ``model`` on ``loader``.

    The model is switched to evaluation mode and run under
    ``torch.inference_mode()``, so no weights are updated and no autograd
    state is recorded.

    Args:
        model: Module producing class logits of shape ``(batch, classes)``.
        loader: Iterable yielding ``(data, target)`` batches.

    Returns:
        Fraction of correctly classified samples among those actually
        processed, as a float in ``[0, 1]``. Counting processed samples
        (instead of ``len(loader.dataset)``) stays correct when the loader
        drops batches or uses a sampler. ``0.0`` is returned for an empty
        loader.
    """
    model.eval()  # Disable dropout and other training-time behaviour

    # Send batches to wherever the model's parameters live; a model without
    # parameters receives the batches unchanged.
    first_param = next(model.parameters(), None)

    correct = 0
    seen = 0

    with torch.inference_mode():
        for data, target in loader:
            if first_param is not None:
                data, target = data.to(first_param.device), target.to(first_param.device)
            out = model(data)
            correct += (out.argmax(dim=1) == target).sum().item()
            seen += target.size(0)

    if seen == 0:
        return 0.0
    return correct / seen

### This function trains a model for multiple epochs, tracks training and test accuracies, and prints progress after each epoch.

In [None]:
def train_model(model, optimizer, criterion, epochs=15, scheduler=None):
    """Train ``model`` for ``epochs`` epochs and record accuracy after each one.

    Every epoch runs one training pass over the module-level ``train_loader``,
    evaluates on the module-level ``test_loader``, steps ``scheduler`` (when
    one is given) and prints a progress line.

    Args:
        model: Network to train.
        optimizer: Optimizer passed to ``train``.
        criterion: Loss function passed to ``train``.
        epochs: Number of epochs to run.
        scheduler: Optional learning-rate scheduler, stepped once per epoch.

    Returns:
        ``(train_accuracies, test_accuracies)``: two lists with one entry per
        epoch.
    """
    train_history, test_history = [], []

    for epoch_number in tqdm(range(1, epochs + 1)):
        # One optimisation pass, then a measurement on the held-out loader.
        epoch_loss, epoch_train_acc = train(model, train_loader, optimizer, criterion, scheduler)
        epoch_test_acc = evaluate(model, test_loader)

        if scheduler:
            scheduler.step()

        train_history.append(epoch_train_acc)
        test_history.append(epoch_test_acc)

        print(f"Epoch: {epoch_number}/{epochs}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_train_acc:.4f}, Test Acc: {epoch_test_acc:.4f}")

    return train_history, test_history


In [None]:
# Train the baseline model with train_model's default epoch count and no scheduler.
train_accuracies, test_accuracies = train_model(model, optimizer, criterion)

### Plotting accuracy against epoch to visualize the trend

In [None]:
def Accuracy_plot(train_accuracies, test_accuracies):
  """Plot per-epoch training and test accuracy on a single figure.

  Args:
    train_accuracies: Sequence with one training accuracy per epoch.
    test_accuracies: Sequence with one test accuracy per epoch.
  """
  plt.figure(figsize=(10, 7))
  # Number the epochs from 1 so the x-axis matches the "Epoch: k/N" lines
  # printed during training (a bare plt.plot(values) would start at 0).
  plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='Train_accuracies')
  plt.plot(range(1, len(test_accuracies) + 1), test_accuracies, label='Test_accuracies')
  plt.xlabel("Epochs")
  plt.ylabel("Accuracy")
  plt.legend()
  plt.title("Training and Test Accuracy")
  plt.show()

In [None]:
# Plot the accuracy curves recorded while training the baseline model.
Accuracy_plot(train_accuracies, test_accuracies)

### Calculating the overall accuracy on the complete test dataset

In [None]:
def total_test_accuracy(model):
    """Return the accuracy of ``model`` on the module-level ``test_loader``, in percent.

    The measurement is delegated to ``evaluate``, which puts the model in
    evaluation mode and runs it under ``torch.inference_mode()``. The previous
    hand-written loop ran the forward passes with autograd enabled, which
    records a graph that is never used.

    Args:
        model: Trained network to evaluate.

    Returns:
        Accuracy as a float between 0 and 100.
    """
    return evaluate(model, test_loader) * 100


In [None]:
# Test accuracy (in percent) of the baseline model; echoed as the cell output.
inital_accuracy = total_test_accuracy(model)
inital_accuracy

# Experiments

### 1. Let's apply some data augmentation techniques

In [None]:
### Changing the transform: augmentation pipeline
# Random crop (after 4-pixel padding), random horizontal flip and colour jitter
# are applied before the tensor conversion and the normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                     std=[0.2023, 0.1994, 0.2010]) ## actual calculated statistics for the dataset
])

In [None]:
# Augment only the training split. The test split keeps the deterministic
# baseline `transform`, so evaluation is not perturbed by random crops, flips
# or colour jitter and stays comparable with the baseline experiment.
train_dataset, _, train_loader, _ = load_data(transform_train)
_, test_dataset, _, test_loader = load_data(transform)

In [None]:
## Fresh model with the baseline architecture, to be trained on the augmented data
model_2 = VisionTransformer(
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    in_channels=CHANNELS,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    depth=DEPTH,
    num_heads=NUM_HEAD,
    mlp_dim=MLP_DIM,
    drop_rate=DROP_RATE,
)
model_2 = model_2.to(device)
optimizer_2 = optim.Adam(model_2.parameters(), lr=LEARNING_RATE)

In [None]:
# Number of training epochs for the data-augmentation experiment.
epochs_model_2 = 15

In [None]:
# Train model_2 on the current module-level loaders (no scheduler).
train_accuracies, test_accuracies = train_model(model_2, optimizer_2, criterion, epochs_model_2)

In [None]:
# Plot the accuracy curves recorded while training model_2.
Accuracy_plot(train_accuracies, test_accuracies)

In [None]:
# Test accuracy (in percent) of model_2; echoed as the cell output.
augemented_data_accuracy = total_test_accuracy(model_2)
augemented_data_accuracy

### 2. Let's use the AdamW optimizer with a scheduler and augmented data.

In [None]:
# Number of training epochs for the AdamW + scheduler experiment; also used
# as T_max of the cosine schedule below.
epochs_model_3 = 15

In [None]:
# Same architecture as the baseline, optimised with AdamW (weight decay 1e-5)
# under a cosine-annealed learning rate spanning the whole training run.
model_3 = VisionTransformer(
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    in_channels=CHANNELS,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    depth=DEPTH,
    num_heads=NUM_HEAD,
    mlp_dim=MLP_DIM,
    drop_rate=DROP_RATE,
)
model_3 = model_3.to(device)
optimizer_3 = optim.AdamW(params=model_3.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer_3, T_max=epochs_model_3)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train model_3; train_model steps the scheduler once after every epoch.
train_accuracies, test_accuracies = train_model(model_3, optimizer_3, criterion, epochs_model_3, scheduler)

In [None]:
# Plot model_3's accuracy curves, then compute its test accuracy in percent
# and echo it as the cell output.
Accuracy_plot(train_accuracies, test_accuracies)
augumented_adamw_accuracy = total_test_accuracy(model_3)
augumented_adamw_accuracy

### 3. Let's do analysis with Overlapping Patches. Here we are using Adam instead of AdamW because we saw that it was giving less accuracy for same number of epochs

In [None]:
# Number of training epochs for the overlapping-patches experiment.
epochs_model_4 = 20

In [None]:
# Overlapping patches: a stride of 2 is smaller than PATCH_SIZE, so adjacent
# patches share pixels and the token sequence becomes longer.
model_4 = VisionTransformer(
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    in_channels=CHANNELS,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    depth=DEPTH,
    num_heads=NUM_HEAD,
    mlp_dim=MLP_DIM,
    drop_rate=DROP_RATE,
    stride=2,
)
model_4 = model_4.to(device)
optimizer_4 = optim.Adam(model_4.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train model_4 for epochs_model_4 epochs (no scheduler).
train_accuracies, test_accuracies = train_model(model_4, optimizer_4, criterion, epochs_model_4)

In [None]:
# Plot model_4's accuracy curves, then compute its test accuracy in percent
# and echo it as the cell output.
Accuracy_plot(train_accuracies, test_accuracies)
overlaped_patches_accuracy = total_test_accuracy(model_4)
overlaped_patches_accuracy

# Summary

In [None]:
import pandas as pd
from tabulate import tabulate

# One (experiment label, test accuracy in percent) pair per experiment.
# NOTE(review): these accuracies are literals, not the values held in the
# accuracy variables computed earlier in this notebook - confirm they are
# still current after re-running the experiments.
_summary_rows = [
    ("Baseline - Non Overlapping, No data augumentation (15 epochs)", 65.48),
    ("With Data Augmentation (15 epochs)", 69.14),
    ("Augmentation + AdamW + Scheduler (15 epochs)", 69.28),
    ("Augmentation + Overlapping Patches + Adam (20 epochs)", 76.27),
]

results = {
    "Experiment": [label for label, _ in _summary_rows],
    "Test Accuracy (%)": [score for _, score in _summary_rows],
}

df = pd.DataFrame(results)
print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))


### Selecting random images from the dataset and predicting their labels with our trained model

In [None]:
def predict_and_plot_grid(model,
                          dataset,
                          classes,
                          grid_size=3,
                          mean=(0.4914, 0.4822, 0.4465),
                          std=(0.2023, 0.1994, 0.2010)):
  """Show a grid of randomly chosen samples with their true and predicted labels.

  Titles are green when the prediction matches the ground truth and red
  otherwise.

  Args:
    model: Trained classifier; it is switched to evaluation mode.
    dataset: Indexable dataset returning ``(image_tensor, label_index)`` with
      channel-first images.
    classes: Sequence mapping a label index to its class name.
    grid_size: Number of rows and of columns in the grid.
    mean: Per-channel mean that was used to normalise the images. The default
      matches the ``Normalize`` step of this notebook's transforms.
    std: Per-channel standard deviation used to normalise the images.
  """
  model.eval()
  # squeeze=False keeps `axes` two-dimensional even when grid_size == 1.
  fig, axes = plt.subplots(grid_size, grid_size, figsize=(9,9), squeeze=False)

  # Shaped (C, 1, 1) so they broadcast over a channel-first image.
  mean_t = torch.tensor(mean).view(-1, 1, 1)
  std_t = torch.tensor(std).view(-1, 1, 1)

  for ax in axes.flat:
    idx = random.randint(0, len(dataset)-1)
    img, true_label = dataset[idx]

    with torch.inference_mode():
      output = model(img.unsqueeze(dim=0).to(device))
    predicted_label = output.argmax(dim=1).item()

    # Invert Normalize(mean, std) to recover displayable colours. The former
    # `img/2 + 0.5` only undoes a (0.5, 0.5) normalisation, which is not the
    # one applied here. Clamp because imshow expects floats in [0, 1].
    shown = (img.cpu() * std_t + mean_t).clamp(0, 1)
    ax.imshow(shown.permute(1, 2, 0).numpy())

    color = 'g' if predicted_label == true_label else 'r'
    ax.set_title(f"Truth: {classes[true_label]}\nPredicted: {classes[predicted_label]}", fontsize=10, color=color)
    ax.axis('off')

  plt.tight_layout()
  plt.show()

In [None]:
# Visualise predictions of model_4 on random samples from the test dataset.
predict_and_plot_grid(model_4, test_dataset, test_dataset.classes)