**1) Checks for a GPU, seeds all random number generators, and imports PyTorch/torchvision utilities.**

In [None]:
# @title Q1: Setup & Imports (GPU check + seeds)
import os, math, time, random, json
from dataclasses import dataclass, asdict
import numpy as np

import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Fail fast when no CUDA device is visible; the message tells the user how to enable one
# from the notebook's runtime menu.
assert torch.cuda.is_available(), "Enable GPU: Runtime > Change runtime type > GPU"
# Global device handle; later cells move the model and every batch onto it.
device = torch.device("cuda")
# Report the name of CUDA device index 0.
print("GPU:", torch.cuda.get_device_name(0))

def seed_all(seed=42, deterministic=False):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG, and PyTorch's CPU and
    CUDA generators with the same value.

    Args:
        seed: Integer seed applied to all generators.
        deterministic: When False (default, the original behavior) cuDNN
            autotuning is enabled (``benchmark = True``), which is faster on
            fixed-size batches but lets cuDNN pick different kernels between
            runs. When True, autotuning is disabled and cuDNN is asked for
            deterministic algorithms, trading speed for run-to-run
            repeatability.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # benchmark and deterministic are opposite ends of the same trade-off, so
    # they are always set together to keep repeated calls consistent.
    torch.backends.cudnn.deterministic = bool(deterministic)
    torch.backends.cudnn.benchmark = not deterministic  # speed on fixed-size batches
seed_all(42)


GPU: Tesla T4


**2) Centralizes all tunables; choose `preset="strong"`, or switch to `"fast"` if needed.**

In [None]:
# @title Q1: Config (switch preset to 'fast' if needed)
@dataclass
class CFG:
    """All tunable hyper-parameters for the ViT / CIFAR-10 experiment in one place.

    Instances are created by ``make_cfg`` and read by the data, model and
    training cells through the module-level ``cfg`` object.
    """
    # data
    num_classes: int = 10             # classifier head width and one-hot target width
    img_size: int = 32                # RandomCrop output size and side length fed to PatchEmbed

    # model (good for CIFAR-10 on T4)
    patch_size: int = 4               # kernel size and stride of the patch-embedding conv
    embed_dim: int = 384              # token width used throughout the transformer
    depth: int = 8                    # number of transformer Blocks
    num_heads: int = 6                # attention heads per Block
    mlp_ratio: float = 4.0            # MLP hidden width = int(embed_dim * mlp_ratio)
    drop_rate: float = 0.1            # dropout after pos-embedding, attention projection and MLP layers
    attn_drop_rate: float = 0.0       # dropout applied to the softmaxed attention weights
    drop_path_rate: float = 0.1       # stochastic-depth rate of the last Block (ramped linearly from 0)

    # training
    epochs: int = 100                 # number of passes over the training loader
    batch_size: int = 256             # training batch size (the test loader uses a fixed 512)
    lr: float = 3e-4                  # base AdamW learning rate that the scheduler scales
    weight_decay: float = 0.05        # AdamW weight decay
    warmup_epochs: int = 5            # linear warmup length, converted to steps in the training cell
    label_smoothing: float = 0.1      # smoothing passed to SoftCELoss
    mixup_alpha: float = 0.2          # Beta(alpha, alpha) parameter for MixUp; <= 0 disables mixing
    ema_decay: float = 0.999          # EMA decay of the weights; <= 0 disables EMA
    grad_clip: float = 1.0            # max gradient norm; <= 0 disables clipping

    # misc
    num_workers: int = 2              # DataLoader worker processes
    amp: bool = True                  # mixed precision
    amp_dtype: torch.dtype = torch.float16  # T4 prefers fp16 (bf16 not supported on T4)
    tta: bool = False                 # simple flip TTA at test time

def make_cfg(preset="strong"):
    """Build a ``CFG`` for one of the named presets.

    Args:
        preset: ``"strong"`` returns the dataclass defaults unchanged;
            ``"fast"`` shrinks the model and halves the epoch count.

    Returns:
        A fresh ``CFG`` instance.

    Raises:
        ValueError: If ``preset`` is not one of the known names. Previously an
            unknown or mistyped preset silently fell back to the "strong"
            settings, which is easy to miss before a long training run.
    """
    c = CFG()
    if preset == "strong":
        return c
    if preset == "fast":
        c.embed_dim = 256
        c.depth = 6
        c.num_heads = 4
        c.batch_size = 256
        c.epochs = 50
        c.drop_path_rate = 0.05
        return c
    raise ValueError(f"Unknown preset {preset!r}; expected 'strong' or 'fast'")

cfg = make_cfg("strong")
print(cfg)


CFG(num_classes=10, img_size=32, patch_size=4, embed_dim=384, depth=8, num_heads=6, mlp_ratio=4.0, drop_rate=0.1, attn_drop_rate=0.0, drop_path_rate=0.1, epochs=100, batch_size=256, lr=0.0003, weight_decay=0.05, warmup_epochs=5, label_smoothing=0.1, mixup_alpha=0.2, ema_decay=0.999, grad_clip=1.0, num_workers=2, amp=True, amp_dtype=torch.float16, tta=False)


In [None]:
# @title Q1: Data & Augmentations
# Per-channel statistics passed to Normalize for both train and test images
# (presumably the CIFAR-10 training-set mean/std -- verify against the dataset).
MEAN = (0.4914, 0.4822, 0.4465)
STD  = (0.2470, 0.2435, 0.2616)

# Training pipeline. Order matters: the crop/flip/RandAugment steps come before
# ToTensor, while Normalize and RandomErasing come after it and act on the tensor.
train_tfms = transforms.Compose([
    transforms.RandomCrop(cfg.img_size, padding=4, padding_mode='reflect'),
    transforms.RandomHorizontalFlip(),
    transforms.RandAugment(num_ops=2, magnitude=9),
    transforms.ToTensor(),
    transforms.Normalize(MEAN, STD),
    # Erasing runs after normalization, so the random fill lands in normalized space.
    transforms.RandomErasing(p=0.25, scale=(0.02,0.2), ratio=(0.3,3.3), value='random')
])

# Evaluation pipeline: no augmentation, only tensor conversion and the same normalization.
test_tfms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(MEAN, STD),
])

# Datasets are downloaded into ./data on first use.
root = "./data"
train_set = datasets.CIFAR10(root, train=True, download=True, transform=train_tfms)
test_set  = datasets.CIFAR10(root, train=False, download=True, transform=test_tfms)

def mixup_batch(x, y, alpha, num_classes=None):
    """Apply MixUp to a batch and return soft targets.

    Args:
        x: Input batch; samples are mixed along dim 0.
        y: Integer class labels, one per sample in ``x``.
        alpha: Beta(alpha, alpha) concentration for the mixing weight.
            ``alpha <= 0`` disables mixing.
        num_classes: Width of the one-hot targets. Defaults to
            ``cfg.num_classes`` so existing three-argument calls are unchanged;
            passing it explicitly removes the dependency on the global config.

    Returns:
        ``(mixed_x, mixed_y, lam)`` where ``mixed_y`` is a float tensor of
        shape ``(len(y), num_classes)`` and ``lam`` is the weight given to the
        un-permuted samples (``1.0`` when mixing is disabled).
    """
    if num_classes is None:
        num_classes = cfg.num_classes
    y_onehot = F.one_hot(y, num_classes).float()
    if alpha <= 0:
        return x, y_onehot, 1.0
    # Plain Python float so the tensor arithmetic below never goes through NumPy scalar dispatch.
    lam = float(np.random.beta(alpha, alpha))
    idx = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[idx]
    # Indexing the one-hot rows with idx equals one-hot encoding y[idx].
    mixed_y = lam * y_onehot + (1 - lam) * y_onehot[idx]
    return mixed_x, mixed_y, lam

# Training loader: reshuffled every epoch; drop_last=True keeps every batch at exactly
# cfg.batch_size samples by discarding the final partial batch.
train_loader = DataLoader(train_set, batch_size=cfg.batch_size, shuffle=True,
                          num_workers=cfg.num_workers, pin_memory=True, drop_last=True)
# Test loader: fixed order, fixed batch size of 512, and every sample is kept.
test_loader  = DataLoader(test_set,  batch_size=512, shuffle=False,
                          num_workers=cfg.num_workers, pin_memory=True)
print(f"train={len(train_set)} images, test={len(test_set)} images")


100%|██████████| 170M/170M [00:14<00:00, 11.8MB/s]


train=50000 images, test=10000 images


**3) ViT model: Pre-LN blocks with residual connections, MHSA + MLP, and stochastic depth (DropPath).**

In [None]:
# @title Q1: ViT model (patchify, CLS, pos-emb, MHSA+MLP blocks)
class DropPath(nn.Module):
    """Stochastic depth: randomly zero the whole input per sample during training.

    One Bernoulli draw is made per element of dim 0 and broadcast over every
    other dimension. Kept samples are scaled by ``1 / keep`` so the expected
    value is unchanged. In eval mode, or when ``drop_prob == 0``, the input is
    returned untouched.

    Args:
        drop_prob: Probability of dropping a sample's input.
    """
    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep = 1 - self.drop_prob
        if keep <= 0:
            # Every sample is dropped. Return zeros directly instead of
            # computing 0 / 0, which would fill the output with NaNs.
            return torch.zeros_like(x)
        # Mask shape (B, 1, ..., 1): one draw per sample, broadcast over the rest.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = torch.rand(shape, device=x.device) < keep
        return x * mask / keep

class PatchEmbed(nn.Module):
    """Cut an image into non-overlapping patches and project each one to a token.

    A single convolution whose kernel size and stride both equal ``patch_size``
    performs the split and the linear projection at once.

    Attributes:
        proj: The patch-projection convolution.
        num_patches: ``(img_size // patch_size) ** 2``.
    """
    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=384):
        super().__init__()
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side * patches_per_side

    def forward(self, x):
        """Map ``(B, C, H, W)`` images to ``(B, N, E)`` patch tokens."""
        feature_map = self.proj(x)                  # (B, E, H', W')
        tokens = feature_map.flatten(start_dim=2)   # (B, E, N)
        return tokens.permute(0, 2, 1)              # (B, N, E)

class MLP(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: Input and output feature width.
        mlp_ratio: Hidden width is ``int(dim * mlp_ratio)``.
        drop: Dropout probability applied after the activation and after the
            second linear layer (one shared ``nn.Dropout`` module).
    """
    def __init__(self, dim, mlp_ratio=4.0, drop=0.0):
        super().__init__()
        hidden_dim = int(dim * mlp_ratio)
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_dim, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

class Attention(nn.Module):
    """Multi-head self-attention with a fused QKV projection.

    Args:
        dim: Token feature width.
        heads: Number of attention heads; must divide ``dim`` evenly.
        attn_drop: Dropout probability on the softmaxed attention weights.
        proj_drop: Dropout probability on the output projection.

    Raises:
        ValueError: If ``dim`` is not divisible by ``heads``. Without this
            check the mismatch only surfaced as a reshape error inside
            ``forward``.
    """
    def __init__(self, dim, heads=8, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        if heads <= 0 or dim % heads != 0:
            raise ValueError(f"dim ({dim}) must be divisible by heads ({heads})")
        self.heads = heads
        self.scale = (dim // heads) ** -0.5   # 1 / sqrt(head_dim)
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Attend over the token dimension of a ``(B, N, C)`` input; returns ``(B, N, C)``."""
        B, N, C = x.shape
        # (B, N, 3C) -> (3, B, heads, N, head_dim) so q, k, v can be split on dim 0.
        qkv = self.qkv(x).reshape(B, N, 3, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale   # (B, heads, N, N)
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = attn @ v                                    # (B, heads, N, head_dim)
        x = x.transpose(1, 2).reshape(B, N, C)          # merge the heads back into C
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

class Block(nn.Module):
    """Pre-LayerNorm transformer block: attention and MLP sub-layers, each residual.

    Each sub-layer is computed on a LayerNorm'd copy of the stream, passed
    through the shared ``DropPath`` module, and added back onto the input.

    Args:
        dim: Token feature width.
        heads: Number of attention heads.
        mlp_ratio: Hidden-width multiplier for the MLP.
        drop: Dropout for the attention output projection and the MLP.
        attn_drop: Dropout on the attention weights.
        drop_path: Stochastic-depth probability for both residual branches.
    """
    def __init__(self, dim, heads, mlp_ratio, drop=0.0, attn_drop=0.0, drop_path=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, heads, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, mlp_ratio=mlp_ratio, drop=drop)

    def forward(self, x):
        attn_branch = self.attn(self.norm1(x))
        x = x + self.drop_path(attn_branch)
        mlp_branch = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_branch)

class VisionTransformer(nn.Module):
    """ViT classifier: patch embedding, CLS token, learned positions, Block stack, linear head.

    Args:
        img_size, patch_size, in_chans: Forwarded to ``PatchEmbed``.
        num_classes: Output width of the classification head.
        embed_dim: Token feature width.
        depth: Number of transformer ``Block`` layers.
        num_heads, mlp_ratio: Forwarded to every ``Block``.
        drop_rate: Dropout after adding positions and inside every ``Block``.
        attn_drop_rate: Attention-weight dropout inside every ``Block``.
        drop_path_rate: Stochastic-depth rate of the last block; earlier blocks
            get linearly spaced rates starting from 0.
    """
    def __init__(self, img_size, patch_size, in_chans, num_classes,
                 embed_dim, depth, num_heads, mlp_ratio,
                 drop_rate, attn_drop_rate, drop_path_rate):
        super().__init__()
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
        seq_len = self.patch_embed.num_patches + 1   # patch tokens plus the CLS token

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, seq_len, embed_dim))
        self.pos_drop = nn.Dropout(drop_rate)

        # One drop-path probability per block, ramping linearly from 0 to drop_path_rate.
        drop_path_schedule = torch.linspace(0, drop_path_rate, steps=depth).tolist()
        self.blocks = nn.ModuleList([
            Block(embed_dim, num_heads, mlp_ratio,
                  drop=drop_rate, attn_drop=attn_drop_rate, drop_path=rate)
            for rate in drop_path_schedule
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

        # Token parameters first, then every Linear / LayerNorm submodule.
        for token_param in (self.cls_token, self.pos_embed):
            nn.init.trunc_normal_(token_param, std=0.02)
        self.apply(self._init)

    @staticmethod
    def _init(m):
        """Initialise one submodule: truncated-normal Linear weights, identity LayerNorm."""
        if isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)
            return
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)`` for an image batch."""
        tokens = self.patch_embed(x)                          # B,N,C
        batch_size = tokens.shape[0]
        cls = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat((cls, tokens), dim=1)              # B,N+1,C
        tokens = tokens + self.pos_embed[:, :tokens.shape[1], :]
        tokens = self.pos_drop(tokens)
        for blk in self.blocks:
            tokens = blk(tokens)
        tokens = self.norm(tokens)
        # Only the CLS token (position 0) feeds the classifier.
        return self.head(tokens[:, 0])


In [None]:
# @title Q1: Loss, EMA, Scheduler, Metrics
class SoftCELoss(nn.Module):
    """Cross-entropy that accepts hard labels or soft (e.g. MixUp) target distributions.

    Args:
        smoothing: Label-smoothing factor; ``0`` leaves the targets unchanged.
    """
    def __init__(self, smoothing=0.0):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, logits, target):
        """Return the batch-mean cross-entropy.

        ``target`` is either a 1-D tensor of class indices or a tensor of
        per-class weights with the same trailing size as ``logits``.
        """
        if target.ndim == 1:
            # Hard labels: expand to one-hot so both input kinds share one code path.
            target = F.one_hot(target, logits.size(-1)).float()
        eps = self.smoothing
        if eps > 0:
            # Move eps of the probability mass onto a uniform distribution.
            target = (1 - eps) * target + eps / target.size(-1)
        log_probs = F.log_softmax(logits, dim=-1)
        per_sample = -(target * log_probs).sum(dim=-1)
        return per_sample.mean()

class EMA:
    """Exponential moving average of a model's ``state_dict``.

    Args:
        model: Module whose current state seeds the average.
        decay: Weight kept from the previous average on each update.

    Attributes:
        shadow: Dict mapping each ``state_dict`` key to its averaged tensor.
    """
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {k: v.detach().clone() for k, v in model.state_dict().items()}

    @torch.no_grad()
    def update(self, model):
        """Fold the model's current state into the running average.

        Floating-point entries become ``decay * shadow + (1 - decay) * value``.
        Non-floating entries (e.g. integer step counters) cannot be averaged,
        so they are copied verbatim; previously they stayed frozen at their
        initial value. Does nothing when ``decay <= 0``.
        """
        if self.decay <= 0:
            return
        for k, v in model.state_dict().items():
            avg = self.shadow[k]
            if v.dtype.is_floating_point:
                avg.mul_(self.decay).add_(v, alpha=1 - self.decay)
            else:
                avg.copy_(v)

    @torch.no_grad()
    def copy_to(self, model):
        """Write the averaged state into ``model`` in place; keys the model lacks are skipped."""
        # Build the state dict once; its tensors share storage with the model,
        # so copy_ updates the real parameters and buffers.
        target = model.state_dict()
        for k, v in self.shadow.items():
            if k in target:
                target[k].copy_(v)

class WarmupCosine(torch.optim.lr_scheduler._LRScheduler):
    """Per-step schedule: linear warmup to the base LR, then cosine decay to zero.

    Args:
        optimizer: Wrapped optimizer.
        warmup_steps: Number of steps over which the LR ramps up linearly.
        total_steps: Step at which the cosine decay reaches zero.
        last_epoch: Forwarded to the base scheduler (``-1`` starts fresh).
    """
    def __init__(self, optimizer, warmup_steps, total_steps, last_epoch=-1):
        # Set before super().__init__ because the base class calls get_lr() during construction.
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # 1-based step index, so the very first optimizer step already has a non-zero LR.
        step = self.last_epoch + 1
        if step < self.warmup_steps:
            return [base * step / max(1, self.warmup_steps) for base in self.base_lrs]
        decay_steps = max(1, self.total_steps - self.warmup_steps)
        # Clamp progress at 1.0: without it the cosine turns back upward and the
        # LR rises again if stepping continues past total_steps.
        t = min(1.0, (step - self.warmup_steps) / decay_steps)
        scale = 0.5 * (1 + math.cos(math.pi * t))
        return [base * scale for base in self.base_lrs]

@torch.no_grad()
def accuracy(logits, y):
    """Return the fraction of rows whose arg-max over dim 1 equals the label, as a Python float."""
    hits = logits.argmax(dim=1).eq(y)
    return hits.float().mean().item()


**4) Full training loop with MixUp, gradient clipping, LR scheduling, best-checkpoint saving, and optional TTA.**

In [None]:
# @title Q1: Train & Evaluate (saves best_vit_cifar10.pth and q1_results.json)

# Define the model, optimizer, and other utilities for training
# Build the network from the config and move it to the GPU.
model = VisionTransformer(
    img_size=cfg.img_size, patch_size=cfg.patch_size, in_chans=3, num_classes=cfg.num_classes,
    embed_dim=cfg.embed_dim, depth=cfg.depth, num_heads=cfg.num_heads, mlp_ratio=cfg.mlp_ratio,
    drop_rate=cfg.drop_rate, attn_drop_rate=cfg.attn_drop_rate, drop_path_rate=cfg.drop_path_rate
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
criterion = SoftCELoss(smoothing=cfg.label_smoothing)

# torch.cuda.amp.GradScaler is deprecated and emits a FutureWarning on recent PyTorch.
# Prefer the device-generic torch.amp.GradScaler and fall back where it does not exist.
_amp_module = getattr(torch, "amp", None)
if hasattr(_amp_module, "GradScaler"):
    scaler = _amp_module.GradScaler("cuda", enabled=cfg.amp)
else:
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.amp)

# EMA of the weights is optional; a non-positive decay disables it.
ema = EMA(model, decay=cfg.ema_decay) if cfg.ema_decay > 0 else None

# The scheduler is stepped once per optimizer step, so epoch counts are converted to step counts.
total_steps = cfg.epochs * len(train_loader)
warmup_steps = cfg.warmup_epochs * len(train_loader)
sched = WarmupCosine(optimizer, warmup_steps=warmup_steps, total_steps=total_steps)

# Best accuracy seen so far and the per-epoch metric log.
best_acc, history = 0.0, []

@torch.no_grad()
def eval_model(eval_m):
    """Return top-1 accuracy of ``eval_m`` over ``test_loader``.

    The model is switched to eval mode and left there. Inference runs under
    autocast as configured by ``cfg.amp`` / ``cfg.amp_dtype``. When ``cfg.tta``
    is set, logits are averaged with those of the horizontally flipped batch.
    """
    eval_m.eval()
    n_correct = 0
    n_seen = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        with torch.autocast(device_type='cuda', dtype=cfg.amp_dtype, enabled=cfg.amp):
            scores = eval_m(images)
            if cfg.tta:
                # Flip along dim 3 (width) and average the two predictions.
                flipped_scores = eval_m(torch.flip(images, dims=[3]))
                scores = 0.5 * (scores + flipped_scores)
        n_correct += (scores.argmax(1) == labels).sum().item()
        n_seen += labels.size(0)
    return n_correct / n_seen

# Training Loop
# When EMA is enabled, evaluation runs on a second network that holds the averaged
# weights. It is built once here and refreshed each epoch, not re-created per epoch.
shadow_model = None
if ema:
    shadow_model = VisionTransformer(cfg.img_size, cfg.patch_size, 3, cfg.num_classes,
                                    cfg.embed_dim, cfg.depth, cfg.num_heads, cfg.mlp_ratio,
                                    cfg.drop_rate, cfg.attn_drop_rate, cfg.drop_path_rate).to(device)

for epoch in range(cfg.epochs):
    model.train()
    t0 = time.time()
    run_loss = run_acc = 0.0

    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        # MixUp returns soft targets; the mixing weight itself is not needed here.
        x, y_mix, _ = mixup_batch(x, y, cfg.mixup_alpha)

        optimizer.zero_grad(set_to_none=True)
        with torch.autocast(device_type='cuda', dtype=cfg.amp_dtype, enabled=cfg.amp):
            logits = model(x)
            loss = criterion(logits, y_mix)

        scaler.scale(loss).backward()
        if cfg.grad_clip > 0:
            # Gradients must be unscaled before clipping so the threshold applies to true norms.
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
        scaler.step(optimizer)
        scaler.update()
        sched.step()
        if ema: ema.update(model)

        run_loss += loss.item()
        # Training accuracy is approximate: it is scored against the dominant class of the mixed target.
        run_acc  += accuracy(logits.detach(), torch.argmax(y_mix, dim=1))

    # Evaluate the EMA weights when available, otherwise the live model.
    if ema:
        shadow_model.load_state_dict(model.state_dict())
        ema.copy_to(shadow_model)
        eval_net = shadow_model
    else:
        eval_net = model
    # NOTE(review): eval_model iterates test_loader, so "val_acc" and best-checkpoint selection
    # are based on the CIFAR-10 test split rather than a held-out validation split.
    val_acc = eval_model(eval_net)

    ep_loss = run_loss / len(train_loader)
    ep_acc  = run_acc  / len(train_loader)
    history.append({"epoch": epoch+1, "train_loss": ep_loss, "train_acc": ep_acc, "val_acc": val_acc})

    if val_acc > best_acc:
        best_acc = val_acc
        # Save the weights that were actually evaluated. Previously the raw model was saved
        # even though val_acc came from the EMA weights, so the checkpoint did not reproduce
        # best_val_acc. The raw weights are kept under a separate key when EMA is on.
        checkpoint = {"model": eval_net.state_dict(), "cfg": asdict(cfg), "best_val_acc": best_acc}
        if ema:
            checkpoint["model_raw"] = model.state_dict()
        torch.save(checkpoint, "best_vit_cifar10.pth")

    print(f"Epoch {epoch+1:03d}/{cfg.epochs} | loss {ep_loss:.4f} | train_acc {ep_acc:.3f} "
          f"| val_acc {val_acc:.4f} | best {best_acc:.4f} | {time.time()-t0:.1f}s")

# Convert `cfg` object to dictionary and ensure that `dtype` is serializable (as a string)
# Turn the config dataclass into a plain dict for JSON export.
cfg_dict = asdict(cfg)
# torch.dtype is not JSON-serializable, so store its string form instead.
cfg_dict['amp_dtype'] = str(cfg.amp_dtype)  # Convert dtype to string

# Write the best accuracy, the per-epoch history and the config to q1_results.json.
with open("q1_results.json", "w") as f:
    json.dump({"best_test_acc": best_acc, "history": history, "cfg": cfg_dict}, f, indent=2)

# best_acc is a fraction in [0, 1]; print it as a percentage.
print("✅ Best test accuracy:", f"{best_acc*100:.2f}%")


  scaler = torch.cuda.amp.GradScaler(enabled=cfg.amp)


Epoch 001/100 | loss 2.1820 | train_acc 0.205 | val_acc 0.1619 | best 0.1619 | 59.0s
Epoch 002/100 | loss 2.0464 | train_acc 0.287 | val_acc 0.2852 | best 0.2852 | 56.8s
Epoch 003/100 | loss 1.9244 | train_acc 0.356 | val_acc 0.3144 | best 0.3144 | 58.0s
Epoch 004/100 | loss 1.8727 | train_acc 0.395 | val_acc 0.3437 | best 0.3437 | 58.1s
Epoch 005/100 | loss 1.8025 | train_acc 0.430 | val_acc 0.3915 | best 0.3915 | 58.4s
Epoch 006/100 | loss 1.7579 | train_acc 0.453 | val_acc 0.4367 | best 0.4367 | 57.5s
Epoch 007/100 | loss 1.7238 | train_acc 0.471 | val_acc 0.4823 | best 0.4823 | 57.7s
Epoch 008/100 | loss 1.7220 | train_acc 0.475 | val_acc 0.5209 | best 0.5209 | 57.4s
Epoch 009/100 | loss 1.6813 | train_acc 0.498 | val_acc 0.5489 | best 0.5489 | 59.8s
Epoch 010/100 | loss 1.6490 | train_acc 0.514 | val_acc 0.5788 | best 0.5788 | 58.2s
Epoch 011/100 | loss 1.6584 | train_acc 0.517 | val_acc 0.6008 | best 0.6008 | 57.9s
Epoch 012/100 | loss 1.6179 | train_acc 0.530 | val_acc 0.6206 | 

In [None]:
# @title Q1: Print final accuracy
# Re-read the results file written by the training cell, so this cell can be run
# without the training variables in memory.
with open("q1_results.json") as f:
    r = json.load(f)
# best_test_acc is stored as a fraction; report it as a percentage.
print(f"Best CIFAR-10 Average Accuracy (%): {100*r['best_test_acc']:.2f}")


Best CIFAR-10 Average Accuracy (%): 88.48
