<a href="https://colab.research.google.com/github/gabriellarumapea/VisionTransformer-Comparison/blob/main/VisionTransformer_Comparison" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os  # Add this import

# Change directory
%cd /content/VisionTransformer-Comparison

# Now this will work
print(">> Workspace READY:", os.getcwd())

/content/VisionTransformer-Comparison
>> Workspace READY: /content/VisionTransformer-Comparison


In [5]:
# Hapus semua yang salah
!rm -rf /content/.git
!rm -rf /content/VisionTransformer-Comparison

# Buat folder proyek BARU
%mkdir -p /content/VisionTransformer-Comparison
%cd /content/VisionTransformer-Comparison

print(">> Workspace READY:", os.getcwd())

shell-init: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory
/content/VisionTransformer-Comparison
>> Workspace READY: /content/VisionTransformer-Comparison


In [6]:
import os

# Buat folder
for folder in ['src', 'results', 'checkpoints']:
    os.makedirs(folder, exist_ok=True)

print(">> Folder structure created")
print(">> PROJECT ROOT:", os.getcwd())

>> Folder structure created
>> PROJECT ROOT: /content/VisionTransformer-Comparison


In [7]:
%%writefile src/dataset.py
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
from pathlib import Path

class ImageClassificationDataset(Dataset):
    def __init__(self, data_dir, transform=None, class_names=None):
        self.data_dir = Path(data_dir)
        self.transform = transform

        if class_names is not None:
            self.classes = class_names
        else:
            self.classes = sorted([d.name for d in self.data_dir.iterdir() if d.is_dir()])

        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        self.samples = []

        for class_dir in self.data_dir.iterdir():
            if class_dir.is_dir():
                for img_path in class_dir.glob("*.*"):
                    if img_path.suffix.lower() in ['.jpg', '.png', '.jpeg']:
                        self.samples.append((img_path, self.class_to_idx[class_dir.name]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, label = self.samples[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

Writing src/dataset.py


In [8]:
%%writefile src/models.py
import timm
import torch.nn as nn

def create_model(model_name, num_classes, pretrained=True):
    return timm.create_model(model_name, pretrained=pretrained, num_classes=num_classes)

def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    size_mb = total * 4 / (1024 * 1024)
    return {
        'Total Parameters': total,
        'Trainable Parameters': trainable,
        'Model Size (MB)': size_mb
    }

Writing src/models.py


In [9]:
%%writefile src/training.py
import torch
from tqdm import tqdm

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, epochs=10):
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    best_acc = 0.0
    model.to(device)

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0

        pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')
        for images, labels in pbar:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            pbar.set_postfix({'Loss': f'{train_loss/total:.4f}', 'Acc': f'{100.*correct/total:.2f}%'})

        train_acc = 100. * correct / total
        train_loss_avg = train_loss / len(train_loader)

        # Validation
        val_results = evaluate_model(model, val_loader, criterion, device)
        scheduler.step(val_results['val_loss'])

        # Save history
        history['train_loss'].append(train_loss_avg)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_results['val_loss'])
        history['val_acc'].append(val_results['val_acc'])

        # Checkpoint
        if val_results['val_acc'] > best_acc:
            best_acc = val_results['val_acc']
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_acc': best_acc,
                'history': history
            }, 'checkpoints/best_model.pth')

        print(f"Epoch {epoch+1} | Train: {train_acc:.2f}% | Val: {val_results['val_acc']:.2f}%")

    return history

def evaluate_model(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return {
        'val_loss': total_loss / len(data_loader),
        'val_acc': 100. * correct / total,
        'predictions': all_preds,
        'labels': all_labels
    }

Writing src/training.py


In [10]:
%%writefile src/evaluation.py
import time
import numpy as np
import torch
from sklearn.metrics import classification_report, confusion_matrix

def measure_inference_time(model, data_loader, device, num_images=100):
    model.eval()

    # Warm-up
    with torch.no_grad():
        for i, (images, _) in enumerate(data_loader):
            if i >= 10:
                break
            images = images.to(device)
            _ = model(images)

    # Measure
    times = []
    total_images = 0

    with torch.no_grad():
        for images, _ in data_loader:
            if total_images >= num_images:
                break

            images = images.to(device)
            batch_size = images.size(0)

            start_time = time.time()
            _ = model(images)
            if device.type == 'cuda':
                torch.cuda.synchronize()
            batch_time = time.time() - start_time

            times.append(batch_time)
            total_images += batch_size

    avg_time_per_batch = np.mean(times)
    avg_time_per_image_ms = (avg_time_per_batch / data_loader.batch_size) * 1000
    throughput = data_loader.batch_size / avg_time_per_batch

    return {
        'Avg Time per Image (ms)': avg_time_per_image_ms,
        'Throughput (images/sec)': throughput,
        'Total Time (100 images)': sum(times) * 1000
    }

def compute_metrics(predictions, labels, classes):
    report = classification_report(labels, predictions, target_names=classes, output_dict=True, zero_division=0)
    cm = confusion_matrix(labels, predictions)
    return report, cm

Writing src/evaluation.py


In [11]:
%%writefile src/visualization.py
import matplotlib.pyplot as plt
import seaborn as sns

def plot_learning_curves(histories, save_path):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    for model_name, history in histories.items():
        epochs = range(1, len(history['train_loss']) + 1)

        ax1.plot(epochs, history['train_loss'], label=f'{model_name} Train', linestyle='--')
        ax1.plot(epochs, history['val_loss'], label=f'{model_name} Val')

        ax2.plot(epochs, history['train_acc'], label=f'{model_name} Train', linestyle='--')
        ax2.plot(epochs, history['val_acc'], label=f'{model_name} Val')

    ax1.set_title('Training & Validation Loss')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)

    ax2.set_title('Training & Validation Accuracy')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Accuracy (%)')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

def plot_confusion_matrix(cm, classes, model_name, save_path):
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

Writing src/visualization.py


In [12]:
%%writefile src/__init__.py
# Package initialization

Writing src/__init__.py


In [13]:
%%writefile download_dataset.py
import gdown
import zipfile
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil

def download_and_organize():
    print(">> DOWNLOAD AND ORGANIZE DATASET")
    print("="*60)

    # Remove existing
    if os.path.exists("dataset_organized"):
        print(">> Removing existing dataset_organized...")
        shutil.rmtree("dataset_organized")

    # Download
    if not os.path.exists("dataset.zip"):
        print(">> Downloading dataset...")
        FILE_ID = "1o3rl6Ap4QjxM5-C9WtiJMIgIwWVFK_ZO"  # GANTI FILE ID KAMU
        url = f"https://drive.google.com/uc?id={FILE_ID}"
        gdown.download(url, "dataset.zip", quiet=False)

    # Extract
    print(">> Extracting...")
    with zipfile.ZipFile("dataset.zip", 'r') as zip_ref:
        zip_ref.extractall("dataset")

    # Clean macosx
    mac_path = Path("dataset") / "__MACOSX"
    if mac_path.exists():
        shutil.rmtree(mac_path)

    # Read CSV
    data_path = Path("dataset")
    csv_path = data_path / "train.csv"
    df = pd.read_csv(csv_path).dropna()
    classes = sorted(df['label'].unique())

    print(f">> Found {len(df)} images, {len(classes)} classes: {classes}")

    # Create folders
    organized_path = Path("dataset_organized")
    for split in ['train', 'val']:
        for cls in classes:
            (organized_path / split / cls).mkdir(parents=True, exist_ok=True)

    # Stratified split
    train_files, val_files = [], []
    for cls in classes:
        cls_files = df[df['label'] == cls]['filename'].tolist()
        train_names, val_names = train_test_split(cls_files, test_size=0.2, random_state=42)
        train_files.extend([(n, cls) for n in train_names])
        val_files.extend([(n, cls) for n in val_names])

    # Copy files
    print(">> Copying train files...")
    for filename, cls in train_files:
        src = data_path / "train" / filename
        dst = organized_path / "train" / cls / filename
        if src.exists():
            shutil.copy2(src, dst)

    print(">> Copying val files...")
    for filename, cls in val_files:
        src = data_path / "train" / filename
        dst = organized_path / "val" / cls / filename
        if src.exists():
            shutil.copy2(src, dst)

    # Cleanup
    os.remove("dataset.zip")
    print(f">> Dataset organized: {len(train_files)} train, {len(val_files)} val")
    print(f">> Path: dataset_organized")

if __name__ == "__main__":
    download_and_organize()

Writing download_dataset.py


In [14]:
%%writefile main.py
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from pathlib import Path
import json
import os
import sys
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='PIL')

# Add src to path
sys.path.append(str(Path(__file__).parent / "src"))

from dataset import ImageClassificationDataset
from models import create_model, count_parameters
from training import train_model, evaluate_model
from evaluation import measure_inference_time, compute_metrics
from visualization import plot_learning_curves, plot_confusion_matrix

def main():
    print(">> VISION TRANSFORMER COMPARISON")
    print("="*80)

    # Config (REKOMENDED: 10 epochs untuk test cepat, ganti jadi 15 untuk full)
    CONFIG = {
        'dataset_path': 'dataset_organized',
        'img_size': 224,
        'batch_size': 32,
        'epochs': 10,  # GANTI JADI 15 UNTUK FULL TRAINING
        'learning_rate': 1e-4,
        'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        'num_classes': None,
        'class_names': None
    }

    print(f">> DEVICE: {CONFIG['device']}")

    # Check dataset
    if not Path(CONFIG['dataset_path']).exists():
        print(">> ERROR: Dataset not found. Run 'python download_dataset.py' first")
        return

    # Load classes
    train_path = Path(CONFIG['dataset_path']) / "train"
    classes = sorted([d.name for d in train_path.iterdir() if d.is_dir()])
    CONFIG['num_classes'] = len(classes)
    CONFIG['class_names'] = classes

    print(f">> CLASSES ({len(classes)}): {classes}")

    # Transforms
    train_transform = transforms.Compose([
        transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    val_transform = transforms.Compose([
        transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # DataLoaders
    train_dataset = ImageClassificationDataset(CONFIG['dataset_path'] + "/train", train_transform, classes)
    val_dataset = ImageClassificationDataset(CONFIG['dataset_path'] + "/val", val_transform, classes)

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, num_workers=2, pin_memory=True if CONFIG['device'].type == 'cuda' else False)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, num_workers=2, pin_memory=True if CONFIG['device'].type == 'cuda' else False)

    print(f">> DATALOADERS: {len(train_loader)} train batches, {len(val_loader)} val batches")

    # Models
    MODEL_CONFIGS = {
        'ViT-Base': 'vit_base_patch16_224',
        'Swin-Base': 'swin_base_patch4_window7_224',
        'DeiT-Base': 'deit_base_patch16_224'
    }

    results = {}
    os.makedirs('results', exist_ok=True)

    for model_name, model_arch in MODEL_CONFIGS.items():
        print(f"\n{'='*80}")
        print(f">> TRAINING {model_name}")
        print(f"{'='*80}")

        model = create_model(model_arch, CONFIG['num_classes'])
        params = count_parameters(model)

        print(f">> PARAMETERS: {params['Total Parameters']:,} (total) | {params['Model Size (MB)']:.2f} MB")

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=0.05)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

        # Train
        history = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, CONFIG['device'], CONFIG['epochs'])

        # Evaluate
        eval_results = evaluate_model(model, val_loader, criterion, CONFIG['device'])
        inference_stats = measure_inference_time(model, val_loader, CONFIG['device'])
        report, cm = compute_metrics(eval_results['predictions'], eval_results['labels'], classes)

        # Save results
        results[model_name] = {
            'parameters': {k: f"{v:,}" if isinstance(v, int) else f"{v:.2f}" for k, v in params.items()},
            'performance': {
                'accuracy': f"{eval_results['val_acc']:.2f}%",
                'precision': f"{report['weighted avg']['precision']:.4f}",
                'recall': f"{report['weighted avg']['recall']:.4f}",
                'f1_score': f"{report['weighted avg']['f1-score']:.4f}"
            },
            'inference': {k: f"{v:.2f}" if isinstance(v, float) else v for k, v in inference_stats.items()},
            'history': history
        }

        # Save metrics
        with open(f'results/metrics_{model_name}.json', 'w') as f:
            json.dump(results[model_name], f, indent=2)

        # Plots
        plot_confusion_matrix(cm, classes, model_name, f'results/cm_{model_name}.png')

        print(f">> {model_name} COMPLETED: {eval_results['val_acc']:.2f}% accuracy")

    # Final plots
    plot_learning_curves({name: r['history'] for name, r in results.items()}, 'results/learning_curves.png')

    # Summary table
    summary = []
    for name, r in results.items():
        summary.append({
            'Model': name,
            'Accuracy': r['performance']['accuracy'],
            'Params': r['parameters']['Total Parameters'],
            'Inference_ms': r['inference']['Avg Time per Image (ms)'],
            'F1_Score': r['performance']['f1_score']
        })

    # Save summary
    with open('results/summary.json', 'w') as f:
        json.dump(summary, f, indent=2)

    # Print summary
    import pandas as pd
    print("\n" + "="*80)
    print("FINAL SUMMARY")
    print("="*80)
    print(pd.DataFrame(summary).to_string(index=False))

    print("\n>> ALL EXPERIMENTS COMPLETED!")
    print(">> Results saved in results/ folder")

if __name__ == "__main__":
    main()

Writing main.py


In [15]:
%%writefile requirements.txt
torch>=1.12.0
torchvision>=0.13.0
timm>=0.9.0
Pillow>=9.0.0
matplotlib>=3.5.0
seaborn>=0.11.0
numpy>=1.21.0
pandas>=1.4.0
tqdm>=4.64.0
scikit-learn>=1.1.0
gdown>=4.6.0

Writing requirements.txt


In [16]:
%%writefile .gitignore
# Dataset & Large Files
dataset/
dataset_organized/
*.zip
*.tar
*.gz
*.tgz

# Model Checkpoints
*.pth
*.pt
checkpoints/

# Python Cache
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
*.egg-info/

# Jupyter
.ipynb_checkpoints
*.ipynb

# IDE & OS
.vscode/
.idea/
.DS_Store
Thumbs.db

# Colab specific
.config/
sample_data/

Writing .gitignore


In [17]:
!pip install -r requirements.txt



In [18]:
!python download_dataset.py

>> DOWNLOAD AND ORGANIZE DATASET
>> Downloading dataset...
Downloading...
From (original): https://drive.google.com/uc?id=1o3rl6Ap4QjxM5-C9WtiJMIgIwWVFK_ZO
From (redirected): https://drive.google.com/uc?id=1o3rl6Ap4QjxM5-C9WtiJMIgIwWVFK_ZO&confirm=t&uuid=d5772b25-953d-4c6b-8b9e-ead3afb2b5ef
To: /content/VisionTransformer-Comparison/dataset.zip
100% 259M/259M [00:05<00:00, 47.5MB/s]
>> Extracting...
>> Found 1108 images, 5 classes: ['bakso', 'gado_gado', 'nasi_goreng', 'rendang', 'soto_ayam']
>> Copying train files...
>> Copying val files...
>> Dataset organized: 885 train, 223 val
>> Path: dataset_organized


In [19]:
import torch
print(f"DEVICE: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")

import os
print("Dataset exists:", os.path.exists("dataset_organized"))
!ls -lh dataset_organized/train/

DEVICE: cuda
Dataset exists: True
total 20K
drwxr-xr-x 2 root root 4.0K Nov 12 08:27 bakso
drwxr-xr-x 2 root root 4.0K Nov 12 08:27 gado_gado
drwxr-xr-x 2 root root 4.0K Nov 12 08:27 nasi_goreng
drwxr-xr-x 2 root root 4.0K Nov 12 08:27 rendang
drwxr-xr-x 2 root root 4.0K Nov 12 08:27 soto_ayam


In [20]:
!python main.py

>> VISION TRANSFORMER COMPARISON
>> DEVICE: cuda
>> CLASSES (5): ['bakso', 'gado_gado', 'nasi_goreng', 'rendang', 'soto_ayam']
>> DATALOADERS: 28 train batches, 7 val batches

>> TRAINING ViT-Base
model.safetensors: 100% 346M/346M [00:02<00:00, 125MB/s]
>> PARAMETERS: 85,802,501 (total) | 327.31 MB
Epoch 1/10: 100% 28/28 [00:31<00:00,  1.11s/it, Loss=0.0311, Acc=62.49%]
Epoch 1 | Train: 62.49% | Val: 94.62%
Epoch 2/10: 100% 28/28 [00:32<00:00,  1.16s/it, Loss=0.0061, Acc=94.12%]
Epoch 2 | Train: 94.12% | Val: 96.86%
Epoch 3/10: 100% 28/28 [00:31<00:00,  1.12s/it, Loss=0.0021, Acc=98.19%]
Epoch 3 | Train: 98.19% | Val: 96.41%
Epoch 4/10: 100% 28/28 [00:32<00:00,  1.17s/it, Loss=0.0018, Acc=98.31%]
Epoch 4 | Train: 98.31% | Val: 96.86%
Epoch 5/10: 100% 28/28 [00:30<00:00,  1.10s/it, Loss=0.0010, Acc=98.19%]
Epoch 5 | Train: 98.19% | Val: 97.76%
Epoch 6/10: 100% 28/28 [00:31<00:00,  1.13s/it, Loss=0.0034, Acc=96.61%]
Epoch 6 | Train: 96.61% | Val: 97.31%
Epoch 7/10: 100% 28/28 [00:31<00:0

In [21]:
!nvidia-smi

Wed Nov 12 08:49:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   72C    P0             34W /   70W |       2MiB /  15360MiB |     24%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [22]:
%%writefile resume.py
import torch
from pathlib import Path
import sys
sys.path.append(str(Path(__file__).parent / "src"))

def resume_training(checkpoint_path, model, optimizer, scheduler):
    """Load checkpoint dan resume"""
    if Path(checkpoint_path).exists():
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        start_epoch = checkpoint['epoch']
        best_acc = checkpoint['best_acc']
        history = checkpoint['history']
        print(f">> Resumed from epoch {start_epoch} with best acc {best_acc:.2f}%")
        return start_epoch, best_acc, history
    return 0, 0, {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

Writing resume.py
