In [None]:
!pip install -q torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
!pip install -q timm

In [None]:
%%bash
cat > train.py <<'PY'
# (SCRIPT STARTS HERE: see the code block later in this file)
PY

In [None]:
# train.py — ViT for CIFAR-10 (single-file, Colab-ready)
# Usage example (in Colab):
# !python train.py --epochs 200 --batch-size 128 --lr 3e-3 --patch-size 4 --embed-dim 256 --depth 10 --num-heads 8

import argparse
import math
import os
import random
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# -------------------------- Model pieces --------------------------
class PatchEmbed(nn.Module):
    def __init__(self, img_size=32, patch_size=4, in_chans=3, embed_dim=256):
        super().__init__()
        assert img_size % patch_size == 0, 'Image dimensions must be divisible by patch size.'
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) * (img_size // patch_size)
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)                        # [B, embed_dim, H/ps, W/ps]
        x = x.flatten(2).transpose(1, 2)       # [B, num_patches, embed_dim]
        return x

class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, dropout=0.):
        super().__init__()
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=True, attn_dropout=0., proj_dropout=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_dropout)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_dropout)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]   # each: [B, nh, N, head_dim]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

def drop_path(x, drop_prob: float = 0., training: bool = False):
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()
    output = x.div(keep_prob) * random_tensor
    return output

class DropPath(nn.Module):
    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_dropout=attn_drop, proj_dropout=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = nn.LayerNorm(dim)
        mlp_hidden = int(dim * mlp_ratio)
        self.mlp = MLP(dim, hidden_features=mlp_hidden, dropout=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

class VisionTransformer(nn.Module):
    def __init__(self, img_size=32, patch_size=4, in_chans=3, num_classes=10,
                 embed_dim=256, depth=10, num_heads=8, mlp_ratio=4., qkv_bias=True,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.):
        super().__init__()
        self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # class token + positional embeddings
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # transformer blocks
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.ModuleList([
            Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                  drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i]) for i in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)

        # classifier head
        self.head = nn.Linear(embed_dim, num_classes)

        # init
        try:
            nn.init.trunc_normal_(self.pos_embed, std=0.02)
            nn.init.trunc_normal_(self.cls_token, std=0.02)
        except Exception:
            # older torch versions may not have trunc_normal_; fallback to normal_
            nn.init.normal_(self.pos_embed, std=0.02)
            nn.init.normal_(self.cls_token, std=0.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            try:
                nn.init.trunc_normal_(m.weight, std=0.02)
            except Exception:
                nn.init.normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)

    def forward(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)                 # [B, N, C]
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)   # [B, 1+N, C]
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        cls = x[:, 0]
        x = self.head(cls)
        return x

# -------------------------- Utilities --------------------------
def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, smoothing=0.1):
        super().__init__()
        assert 0.0 <= smoothing < 1.0
        self.smoothing = smoothing

    def forward(self, pred, target):
        logprobs = F.log_softmax(pred, dim=-1)
        nll = -logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = (1.0 - self.smoothing) * nll + self.smoothing * smooth_loss
        return loss.mean()

class SimpleMixup:
    def __init__(self, alpha=0.8, device='cuda'):
        self.alpha = alpha
        self.device = device

    def __call__(self, x, y):
        if self.alpha <= 0:
            return x, y
        lam = np.random.beta(self.alpha, self.alpha)
        batch_size = x.size(0)
        index = torch.randperm(batch_size).to(self.device)
        mixed_x = lam * x + (1 - lam) * x[index, :]
        y_a, y_b = y, y[index]
        return mixed_x, (y_a, y_b, lam)

class MixupLoss:
    def __init__(self, criterion):
        self.criterion = criterion

    def __call__(self, preds, targets):
        if isinstance(targets, tuple):
            y_a, y_b, lam = targets
            return lam * self.criterion(preds, y_a) + (1 - lam) * self.criterion(preds, y_b)
        else:
            return self.criterion(preds, targets)

# cosine lr scheduler with warmup
def build_scheduler(optimizer, total_epochs, steps_per_epoch, lr, warmup_epochs=10):
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch

    def lr_lambda(current_step):
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return LambdaLR(optimizer, lr_lambda)

# -------------------------- Training / Validation loops --------------------------
def train_one_epoch(model, loader, optimizer, device, epoch, loss_fn, scheduler=None, mixup_fn=None):
    model.train()
    running_loss = 0.0
    top1 = 0.0
    total = 0
    for i, (x, y) in enumerate(loader):
        x = x.to(device)
        y = y.to(device)
        if mixup_fn is not None:
            x, y = mixup_fn(x, y)
            # When Mixup is active, the target `y` is a tuple (y_a, y_b, lam).
            # We need to use the original labels y_a for calculating accuracy.
            y_for_accuracy = y[0]
        else:
            y_for_accuracy = y


        logits = model(x)
        loss = loss_fn(logits, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # Use y_for_accuracy for accuracy calculation
        acc1 = accuracy(logits.detach().cpu(), y_for_accuracy.detach().cpu(), topk=(1,))[0]
        running_loss += loss.item() * x.size(0)
        top1 += acc1.item() * x.size(0)
        total += x.size(0)

    return running_loss / total, top1 / total

@torch.no_grad()
def validate(model, loader, device, loss_fn):
    model.eval()
    running_loss = 0.0
    top1 = 0.0
    total = 0
    for x, y in loader:
        x = x.to(device)
        y = y.to(device)
        logits = model(x)
        loss = loss_fn(logits, y)
        acc1 = accuracy(logits.detach().cpu(), y.detach().cpu(), topk=(1,))[0]
        running_loss += loss.item() * x.size(0)
        top1 += acc1.item() * x.size(0)
        total += x.size(0)
    return running_loss / total, top1 / total

# -------------------------- Main & argparsing --------------------------
def main(args):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('Device:', device)

    # reproducibility (optional)
    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if device == 'cuda':
            torch.cuda.manual_seed_all(args.seed)

    # data transforms
    normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])
    train_transforms = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
    ]
    if args.rand_augment:
        # torchvision's RandAugment requires reasonably recent torchvision
        try:
            train_transforms.append(transforms.RandAugment())
        except Exception:
            pass
    train_transforms += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transforms)

    test_transform = transforms.Compose([transforms.ToTensor(), normalize])

    # datasets
    train_set = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
    test_set = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=min(8, os.cpu_count() or 1), pin_memory=True)
    test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=min(8, os.cpu_count() or 1), pin_memory=True)

    # model
    model = VisionTransformer(img_size=32, patch_size=args.patch_size, num_classes=10,
                              embed_dim=args.embed_dim, depth=args.depth, num_heads=args.num_heads,
                              mlp_ratio=args.mlp_ratio, drop_rate=args.dropout, attn_drop_rate=args.attn_dropout,
                              drop_path_rate=args.drop_path_rate)
    model.to(device)

    # criterion
    if args.label_smoothing > 0:
        base_criterion = LabelSmoothingCrossEntropy(smoothing=args.label_smoothing)
    else:
        base_criterion = nn.CrossEntropyLoss()

    if args.mixup_alpha > 0:
        mixup_fn = SimpleMixup(alpha=args.mixup_alpha, device=device)
        criterion = MixupLoss(base_criterion)
    else:
        mixup_fn = None
        criterion = base_criterion

    # optimizer + scheduler
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    steps_per_epoch = len(train_loader)
    scheduler = build_scheduler(optimizer, total_epochs=args.epochs, steps_per_epoch=steps_per_epoch, lr=args.lr, warmup_epochs=args.warmup_epochs)

    # checkpoint directory
    os.makedirs(args.output_dir, exist_ok=True)

    best_acc = 0.0
    global_step = 0
    for epoch in range(args.epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, device, epoch, criterion, scheduler=scheduler, mixup_fn=mixup_fn)
        val_loss, val_acc = validate(model, test_loader, device, base_criterion)

        print(f"Epoch {epoch+1}/{args.epochs} — train loss: {train_loss:.4f} — train acc: {train_acc:.2f}% — val loss: {val_loss:.4f} — val acc: {val_acc:.2f}%")

        # save best
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save({'epoch': epoch+1, 'model_state': model.state_dict(), 'optimizer_state': optimizer.state_dict(), 'best_acc': best_acc}, os.path.join(args.output_dir, 'best.pth'))

    print('Training finished. Best val acc: %.2f%%' % best_acc)

    # Save the final model checkpoint
    torch.save({'epoch': args.epochs, 'model_state': model.state_dict(), 'optimizer_state': optimizer.state_dict(), 'best_acc': best_acc}, os.path.join(args.output_dir, 'final.pth'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ViT CIFAR10 — single-file')
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=3e-3)
    parser.add_argument('--weight-decay', type=float, default=0.05)
    parser.add_argument('--warmup-epochs', type=int, default=10)
    parser.add_argument('--patch-size', type=int, default=4)
    parser.add_argument('--embed-dim', type=int, default=256)
    parser.add_argument('--depth', type=int, default=10)
    parser.add_argument('--num-heads', type=int, default=8)
    parser.add_argument('--mlp-ratio', type=float, default=4.0)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--attn-dropout', type=float, default=0.0)
    parser.add_argument('--drop-path-rate', type=float, default=0.1)
    parser.add_argument('--label-smoothing', type=float, default=0.1)
    parser.add_argument('--mixup-alpha', type=float, default=0.8)
    parser.add_argument('--rand-augment', action='store_true')
    parser.add_argument('--output-dir', type=str, default='./checkpoints')
    parser.add_argument('--seed', type=int, default=42)
    args = parser.parse_args()

    main(args)# save final checkpoint
torch.save({'epoch': epoch+1, 'model_state': model.state_dict(), 'best_acc': best_acc},
           os.path.join(args.output_dir, 'final.pth'))
print("Final checkpoint saved:", os.path.join(args.output_dir, 'final.pth'))



usage: colab_kernel_launcher.py [-h] [--epochs EPOCHS]
                                [--batch-size BATCH_SIZE] [--lr LR]
                                [--weight-decay WEIGHT_DECAY]
                                [--warmup-epochs WARMUP_EPOCHS]
                                [--patch-size PATCH_SIZE]
                                [--embed-dim EMBED_DIM] [--depth DEPTH]
                                [--num-heads NUM_HEADS]
                                [--mlp-ratio MLP_RATIO] [--dropout DROPOUT]
                                [--attn-dropout ATTN_DROPOUT]
                                [--drop-path-rate DROP_PATH_RATE]
                                [--label-smoothing LABEL_SMOOTHING]
                                [--mixup-alpha MIXUP_ALPHA] [--rand-augment]
                                [--output-dir OUTPUT_DIR] [--seed SEED]
colab_kernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-792014bf-bc9e-44de-9666-cee6567772f8.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Basic run (uses GPU). Tune args below as desired.
!python train.py --epochs 200 --batch-size 128 --lr 3e-3 --weight-decay 0.05 \
--patch-size 4 --embed-dim 256 --depth 10 --num-heads 8 --mlp-ratio 4.0 --label-smoothing 0.1

usage: train.py [-h] [--epochs EPOCHS] [--batch-size BATCH_SIZE] [--lr LR]
                [--weight-decay WEIGHT_DECAY] [--warmup-epochs WARMUP_EPOCHS]
                [--patch-size PATCH_SIZE] [--embed-dim EMBED_DIM]
                [--depth DEPTH] [--num-heads NUM_HEADS]
                [--mlp-ratio MLP_RATIO] [--dropout DROPOUT]
                [--attn-dropout ATTN_DROPOUT]
                [--drop-path-rate DROP_PATH_RATE]
                [--label-smoothing LABEL_SMOOTHING]
                [--mixup-alpha MIXUP_ALPHA] [--rand-augment]
                [--output-dir OUTPUT_DIR] [--seed SEED]
train.py: error: unrecognized arguments: --mlp-dim 512


In [None]:
!pip install timm  # only if you use timm for schedulers, optional
!pip install torch torchvision  # usually preinstalled in Colab

# 2. (Optional) Verify GPU
!nvidia-smi


Thu Oct  2 20:28:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# =======================
# RUN TRAINING
# =======================

!python train.py \
  --epochs 100 \
  --batch-size 128 \
  --lr 3e-4 \
  --weight-decay 0.05 \
  --embed-dim 256 \
  --mlp-ratio 4.0 \
  --depth 8 \
  --num-heads 8 \
  --patch-size 4 \
  --dropout 0.1 \
  --attn-dropout 0.0 \
  --label-smoothing 0.1 \
  --rand-augment \
  --output-dir ./vit_cifar10_out \
  --seed 42

Device: cuda
Epoch 1/100 — train loss: 2.2272 — train acc: 14.57% — val loss: 2.0412 — val acc: 25.68%
Epoch 2/100 — train loss: 2.1510 — train acc: 18.04% — val loss: 1.9191 — val acc: 32.42%
Epoch 3/100 — train loss: 2.0968 — train acc: 20.71% — val loss: 1.8644 — val acc: 36.18%
Epoch 4/100 — train loss: 2.0437 — train acc: 23.68% — val loss: 1.7677 — val acc: 42.72%
Epoch 5/100 — train loss: 1.9889 — train acc: 24.64% — val loss: 1.6560 — val acc: 46.56%
Epoch 6/100 — train loss: 1.9413 — train acc: 27.40% — val loss: 1.5888 — val acc: 51.20%
Epoch 7/100 — train loss: 1.9015 — train acc: 28.32% — val loss: 1.5205 — val acc: 53.48%
Epoch 8/100 — train loss: 1.8864 — train acc: 29.37% — val loss: 1.4708 — val acc: 56.59%
Epoch 9/100 — train loss: 1.8646 — train acc: 29.53% — val loss: 1.4703 — val acc: 56.35%
Epoch 10/100 — train loss: 1.8577 — train acc: 30.38% — val loss: 1.4985 — val acc: 54.48%
Epoch 11/100 — train loss: 1.8352 — train acc: 31.61% — val loss: 1.3768 — val acc: 60

In [None]:
torch.save(model.state_dict(), "vit_cifar10_best.pth")

NameError: name 'torch' is not defined

In [None]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Test accuracy: {100.0 * correct / total:.2f}%")


NameError: name 'model' is not defined

In [None]:
import os, torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

# --- Locate checkpoint ---
ckpt_dir = './vit_cifar10_out'
ckpt_path = None
if os.path.exists(ckpt_dir):
    for f in os.listdir(ckpt_dir):
        if f.endswith('.pth') or f.endswith('.pt'):
            ckpt_path = os.path.join(ckpt_dir, f)
            break
print("Using checkpoint:", ckpt_path)

if ckpt_path is None:
    raise FileNotFoundError("No checkpoint found in ./vit_cifar10_out. Train or save first.")

# --- Build model (must match training config) ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = VisionTransformer(img_size=32, patch_size=4, num_classes=10,
                          embed_dim=256, depth=8, num_heads=8,
                          mlp_ratio=4.0, drop_rate=0.1, attn_drop_rate=0.0,
                          drop_path_rate=0.1).to(device)

ckpt = torch.load(ckpt_path, map_location=device)
if isinstance(ckpt, dict) and 'model_state' in ckpt:
    state = ckpt['model_state']
elif isinstance(ckpt, dict) and any(k in ckpt for k in ['state_dict','model_state_dict']):
    for k in ['state_dict','model_state','model_state_dict']:
        if k in ckpt: state = ckpt[k]; break
else:
    state = ckpt
try:
    model.load_state_dict(state)
except RuntimeError:
    from collections import OrderedDict
    new_state = OrderedDict((k.replace('module.',''), v) for k,v in state.items())
    model.load_state_dict(new_state)

# --- CIFAR-10 test loader ---
normalize = transforms.Normalize(mean=[0.4914,0.4822,0.4465], std=[0.247,0.243,0.261])
test_transform = transforms.Compose([transforms.ToTensor(), normalize])
test_loader = DataLoader(
    datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform),
    batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

# --- Evaluate ---
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for imgs, labels in test_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        preds = model(imgs).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

acc = 100.0 * correct / total
print(f"✅ Test accuracy: {acc:.2f}%")

Using checkpoint: None


FileNotFoundError: No checkpoint found in ./vit_cifar10_out. Train or save first.

In [None]:
!ls ./checkpoints


ls: cannot access './checkpoints': No such file or directory


In [21]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
%cd /content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [28]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/drive/MyDrive/Colab Notebooks/.git/


In [29]:
!git remote add origin https://github.com/devikapmenon/AIRL-Assignment.git

In [30]:
!git add q1.ipynb q2.ipynb README.md


fatal: pathspec 'README.md' did not match any files


In [34]:
!git commit -m "Initial commit — CIFAR-10 ViT submission"


On branch master

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mUntitled[m
	[31mUntitled0.ipynb[m
	[31mUntitled1.ipynb[m
	[31mmodel_training (1).ipynb[m
	[31mmodel_training (2).ipynb[m
	[31mmodel_training.ipynb[m
	[31mq1.ipynb[m
	[31mq2 (1).ipynb[m
	[31mq2.ipynb[m

nothing added to commit but untracked files present (use "git add" to track)


In [33]:
!git config --global user.email "aadikrishna4008@gmail.com"
!git config --global user.name "devikapmenon"

In [37]:
!git commit -m "Initial commit — CIFAR-10 ViT submission"


On branch master

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mUntitled[m
	[31mUntitled0.ipynb[m
	[31mUntitled1.ipynb[m
	[31mmodel_training (1).ipynb[m
	[31mmodel_training (2).ipynb[m
	[31mmodel_training.ipynb[m
	[31mq1.ipynb[m
	[31mq2 (1).ipynb[m
	[31mq2.ipynb[m

nothing added to commit but untracked files present (use "git add" to track)


In [38]:
!git branch -M main
!git push -u origin main


error: src refspec main does not match any
[31merror: failed to push some refs to 'https://github.com/devikapmenon/AIRL-Assignment.git'
[m

In [36]:
!git add q1.ipynb q2.ipynb README.md

fatal: pathspec 'README.md' did not match any files
