# Project 6. Apply algorithms to train an image classifier

Here is a practical Python example that trains an image classifier using a Vision Transformer (ViT) with transfer learning:

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet50
from torchvision.models.vision_transformer import VisionTransformer

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hyperparameters shared by the data pipeline, the model and the training loop.
num_classes, batch_size = 10, 32
num_epochs, learning_rate = 10, 1e-3

# Preprocessing: resize every image to 224x224, convert it to a tensor and
# normalize it with the ImageNet channel statistics.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),  # ImageNet normalization
    ]
)

# CIFAR-10 train and test splits, downloaded to ./data on first use.
# Both splits share the same preprocessing.
train_dataset, test_dataset = (
    torchvision.datasets.CIFAR10(
        root="./data",
        train=is_train_split,
        download=True,
        transform=transform,
    )
    for is_train_split in (True, False)
)

# Mini-batch iterators: the training split is shuffled, the test split is not.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Build a hybrid ResNet-50 -> Vision Transformer classifier.
#
# The pre-trained ResNet-50 acts purely as a frozen convolutional feature
# extractor. A Vision Transformer (trained from scratch) then classifies the
# resulting feature map, treating every spatial location as one token.

# Load ResNet-50 with ImageNet weights. IMAGENET1K_V1 is the checkpoint that the
# deprecated `pretrained=True` flag used to select.
resnet = resnet50(weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V1)

# Freeze the ResNet weights so that only the ViT is trained.
# NOTE: freezing parameters does not stop BatchNorm layers from updating their
# running statistics while the module is in training mode.
for param in resnet.parameters():
    param.requires_grad = False

# Drop the last two children (global average pool and fully connected layer) so
# the backbone returns a spatial feature map instead of class logits.
backbone = nn.Sequential(*list(resnet.children())[:-2])
backbone_channels = resnet.fc.in_features  # channel count of the feature map
feature_map_size = 224 // 32  # 224x224 inputs with a total stride of 32 -> 7x7

# Create the Vision Transformer (ViT) that consumes the feature map.
# torchvision's constructor uses image_size / num_layers / num_heads / hidden_dim;
# its single `dropout` argument also covers the embedding dropout.
hidden_dim = 768
vit = VisionTransformer(
    image_size=feature_map_size,
    patch_size=1,  # one token per feature-map location
    num_layers=12,
    num_heads=12,
    hidden_dim=hidden_dim,
    mlp_dim=3072,
    dropout=0.1,
    num_classes=num_classes,
)
# The stock patch embedding expects 3-channel images; swap in a 1x1 convolution
# that projects the backbone channels to the transformer width instead.
vit.conv_proj = nn.Conv2d(backbone_channels, hidden_dim, kernel_size=1)

# Combine ResNet and ViT models: images -> feature map -> class logits
model = nn.Sequential(backbone, vit)

# Move the model to the selected device before building the optimizer so the
# optimizer holds references to the parameters that are actually trained.
model = model.to(device)

# Loss for multi-class classification on raw logits.
criterion = nn.CrossEntropyLoss()

# Register only trainable parameters with Adam: frozen ones never receive
# gradients, so there is no reason to track them in the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)

# Training loop: one optimizer step per mini-batch, with the loss logged every
# 100 steps.
total_step = len(train_loader)
for epoch in range(num_epochs):
    # Set training mode explicitly so dropout is active even if this loop is
    # re-run after the evaluation code has switched the model to eval mode.
    model.train()

    for step, (images, labels) in enumerate(train_loader, start=1):
        # Move tensors to the device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print training progress
        if step % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{step}/{total_step}], Loss: {loss.item():.4f}")

# Evaluate on the test split: switch to inference mode, disable gradient
# tracking, and count how many predictions match the labels.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        # The predicted class is the index of the largest logit in each row.
        predicted = model(images).max(dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Report accuracy as a percentage of correctly classified test images.
accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")


In this code example, we leverage transfer learning by using a pre-trained ResNet-50 model as a feature extractor and combine it with a Vision Transformer (ViT) model for image classification. The code utilizes the CIFAR-10 dataset for demonstration purposes.

First, the dataset is loaded and preprocessed using standard transformations. The ResNet-50 model is loaded and modified by replacing the fully connected layer to match the number of classes in the dataset. The ResNet layers are frozen to prevent their weights from being updated during training.

The Vision Transformer (ViT) model is then created with specific parameters like image size, patch size, number of classes, depth, heads, etc. The ResNet and ViT models are combined into a single sequential model.

Next, the model is moved to the available device (GPU if available). The loss function (CrossEntropyLoss) and optimizer (Adam) are defined.

The training loop iterates through the dataset, performs forward and backward passes, and optimizes the model parameters. The progress is printed periodically to monitor the training process.

After training, the model is evaluated on the test set. The accuracy is calculated by comparing the predicted labels with the ground truth labels.

This code provides a starting point for training an image classifier using Vision Transformers with transfer learning. Additional modifications and optimizations can be made based on specific requirements and datasets.