In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

# Define the Vision Transformer (VIT) architecture
class VIT_3c(nn.Module):
    """Vision Transformer classifier for multi-channel (3-channel by default) images.

    The image is cut into non-overlapping ``patch_size x patch_size`` patches by a
    strided convolution, a learnable class token is prepended, learnable positional
    embeddings are added, the sequence is run through a Transformer encoder and the
    class token's final representation is projected to ``num_classes`` logits.

    Args:
        image_width: Width of the input images in pixels.
        image_height: Height of the input images in pixels.
        patch_size: Side length of each square patch (also the convolution stride).
        num_classes: Number of output logits.
        dim: Embedding dimension of every token.
        depth: Number of Transformer encoder layers.
        heads: Number of attention heads per layer.
        mlp_dim: Hidden size of each encoder layer's feed-forward network.
        dropout: Dropout probability used inside the encoder layers.
        in_channels: Number of channels in the input images (defaults to 3).
    """

    def __init__(self, image_width, image_height, patch_size, num_classes, dim, depth, heads, mlp_dim, dropout,
                 in_channels=3):
        super().__init__()

        # A convolution whose kernel and stride both equal the patch size embeds
        # every non-overlapping patch into a `dim`-sized vector in one step.
        self.patch_embed = nn.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size)

        # Floor division matches what the strided convolution produces, so the
        # positional table has exactly one row per emitted patch (plus one for
        # the class token).
        num_patches = (image_width // patch_size) * (image_height // patch_size)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))

        # batch_first=True is essential: forward() builds (batch, tokens, dim)
        # tensors. With the default (batch_first=False) the encoder would read
        # dimension 0 as the sequence and attend across the *samples of a batch*
        # rather than across the patches of each image.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim,
            nhead=heads,
            dim_feedforward=mlp_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        self.fc = nn.Linear(dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for an image batch.

        Args:
            x: Tensor of shape ``(batch, in_channels, image_height, image_width)``.
        """
        x = self.patch_embed(x)  # (batch, dim, H / patch, W / patch)
        x = x.flatten(2).transpose(1, 2)  # (batch, num_patches, dim)
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # One class token per sample
        x = torch.cat((cls_token, x), dim=1)  # (batch, num_patches + 1, dim)
        x = x + self.pos_embedding  # Broadcast the positional table over the batch

        x = self.transformer_encoder(x)

        # Classify from the class token (position 0) only.
        return self.fc(x[:, 0])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

# Define the Vision Transformer (VIT) architecture
class VIT_2c(nn.Module):
    """Vision Transformer classifier for single-channel (grayscale by default) images.

    The image is cut into non-overlapping ``patch_size x patch_size`` patches by a
    strided convolution, a learnable class token is prepended, learnable positional
    embeddings are added, the sequence is run through a Transformer encoder and the
    class token's final representation is projected to ``num_classes`` logits.

    Args:
        image_width: Width of the input images in pixels.
        image_height: Height of the input images in pixels.
        patch_size: Side length of each square patch (also the convolution stride).
        num_classes: Number of output logits.
        dim: Embedding dimension of every token.
        depth: Number of Transformer encoder layers.
        heads: Number of attention heads per layer.
        mlp_dim: Hidden size of each encoder layer's feed-forward network.
        dropout: Dropout probability used inside the encoder layers.
        in_channels: Number of channels in the input images (defaults to 1).
    """

    def __init__(self, image_width, image_height, patch_size, num_classes, dim, depth, heads, mlp_dim, dropout,
                 in_channels=1):
        super().__init__()

        # A convolution whose kernel and stride both equal the patch size embeds
        # every non-overlapping patch into a `dim`-sized vector in one step.
        self.patch_embed = nn.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size)

        # Floor division matches what the strided convolution produces, so the
        # positional table has exactly one row per emitted patch (plus one for
        # the class token).
        num_patches = (image_width // patch_size) * (image_height // patch_size)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))

        # batch_first=True is essential: forward() builds (batch, tokens, dim)
        # tensors. With the default (batch_first=False) the encoder would read
        # dimension 0 as the sequence and attend across the *samples of a batch*
        # rather than across the patches of each image.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim,
            nhead=heads,
            dim_feedforward=mlp_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        self.fc = nn.Linear(dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for an image batch.

        Args:
            x: Tensor of shape ``(batch, in_channels, image_height, image_width)``.
        """
        x = self.patch_embed(x)  # (batch, dim, H / patch, W / patch)
        x = x.flatten(2).transpose(1, 2)  # (batch, num_patches, dim)
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # One class token per sample
        x = torch.cat((cls_token, x), dim=1)  # (batch, num_patches + 1, dim)
        x = x + self.pos_embedding  # Broadcast the positional table over the batch

        x = self.transformer_encoder(x)

        # Classify from the class token (position 0) only.
        return self.fc(x[:, 0])


In [None]:
# MNIST
import torchvision.transforms as transforms
from torchvision.datasets import MNIST

# Preprocessing applied to every MNIST image, in order:
#   1. resize to 32x32, the input size the VIT model is given,
#   2. convert the image to a PyTorch tensor,
#   3. normalize with the mean / standard deviation of the MNIST dataset.
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Build the training split (train=True) and the test split (train=False) with
# identical settings; files are downloaded into ./data when missing.
train_dataset, test_dataset = (
    MNIST(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 218481461.55it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 27789789.82it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 1648877/1648877 [00:00<00:00, 66426780.48it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 13165534.74it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [None]:
# HYPERPARAMETERS
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

batch_size = 64

# VIT model: 32x32 inputs cut into 4x4 patches, classified into 10 classes
model = VIT_2c(
    image_width=32, image_height=32, patch_size=4,
    num_classes=10,
    dim=64, depth=6, heads=8, mlp_dim=128,
    dropout=0.1,
)

# Batched iterators over the datasets; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Train on MNIST for a single pass over the training set, then report test accuracy
from tqdm import tqdm

# Set model to training mode
model.train()

# Per-batch training losses, in iteration order
train_losses = []

# Wrap the loader in a progress bar under its own name. Rebinding
# `train_loader` itself would leave that name pointing at a tqdm object, so
# re-running this cell would nest progress bars and later cells would no
# longer see a DataLoader.
progress_bar = tqdm(train_loader)

for data, target in progress_bar:
    # Zero out gradients from previous iteration
    optimizer.zero_grad()

    # Make predictions using VIT model
    output = model(data)

    # Calculate loss
    loss = criterion(output, target)

    # Backpropagate loss to compute gradients
    loss.backward()

    # Update model weights
    optimizer.step()

    # Record the loss and show it on the progress bar
    batch_loss = loss.item()
    train_losses.append(batch_loss)
    progress_bar.set_description(f'Loss: {batch_loss:.4f}')


# Set model to evaluation mode
model.eval()

correct = 0

with torch.no_grad():
    for data, target in test_loader:
        # Make predictions using VIT model
        output = model(data)

        # The predicted class is the index of the largest logit
        pred = output.argmax(dim=1)

        # Count number of correct predictions
        correct += pred.eq(target).sum().item()

# Calculate accuracy as percentage of correct predictions
accuracy = 100. * correct / len(test_loader.dataset)

print(f'Test accuracy: {accuracy:.2f}%')


Loss: 2.2810: 100%|██████████| 938/938 [07:25<00:00,  2.11it/s]


Test accuracy: 9.58%


In [None]:
# Hyperparameters

# Input geometry and label space
image_size, patch_size = 32, 8
num_classes = 10  # CIFAR-10 has 10 classes

# Transformer dimensions
dim, mlp_dim = 128, 256
depth, heads = 6, 8
dropout = 0.1

# Optimization settings
batch_size = 64
epochs = 100
learning_rate = 0.005

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
import matplotlib.pyplot as plt


# Data preparation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize to [-1, 1]
])

# Load the CIFAR-10 training archive (train=True); the validation and test
# subsets used below are carved out of this same archive.
full_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)

# Split 80% / 10% / remainder into training, validation, and test sets.
# A seeded generator keeps the split identical across kernel restarts.
num_samples = len(full_dataset)
train_size = int(0.8 * num_samples)
val_size = int(0.1 * num_samples)
test_size = num_samples - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(0))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, optimizer, and loss function.
# The notebook defines no class named `VIT`; the 3-channel model is VIT_3c,
# which takes width and height separately (the images here are square).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VIT_3c(
    image_width=image_size,
    image_height=image_size,
    patch_size=patch_size,
    num_classes=num_classes,
    dim=dim,
    depth=depth,
    heads=heads,
    mlp_dim=mlp_dim,
    dropout=dropout,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training loop with logging of losses
train_losses = []
val_losses = []
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size so the epoch
        # figure is a true per-sample average even when the last batch is short.
        train_loss += loss.item() * images.size(0)
    train_loss /= len(train_loader.dataset)
    train_losses.append(train_loss)

    # Calculate validation loss
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * images.size(0)
    val_loss /= len(val_loader.dataset)
    val_losses.append(val_loss)

    print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

print("Training finished!")

# Plotting the losses over time; the x-axis is sized from the recorded losses
# so it always matches the number of completed epochs.
epoch_axis = range(1, len(train_losses) + 1)
plt.plot(epoch_axis, train_losses, label='Training Loss')
plt.plot(epoch_axis, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss Over Time')
plt.show()

# Finally, evaluate the model on the test set
model.eval()
test_loss = 0.0
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        test_loss += loss.item() * images.size(0)
        predicted = outputs.argmax(dim=1)  # Class with the largest logit
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss /= len(test_loader.dataset)
test_accuracy = 100 * correct / total
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")


Files already downloaded and verified
Epoch [1/100], Train Loss: 2.3260, Val Loss: 2.3067
Epoch [2/100], Train Loss: 2.3056, Val Loss: 2.3049
Epoch [3/100], Train Loss: 2.3047, Val Loss: 2.3034
Epoch [4/100], Train Loss: 2.3047, Val Loss: 2.3037
Epoch [5/100], Train Loss: 2.3043, Val Loss: 2.3026
Epoch [6/100], Train Loss: 2.3044, Val Loss: 2.3031
Epoch [7/100], Train Loss: 2.3046, Val Loss: 2.3036
Epoch [8/100], Train Loss: 2.3041, Val Loss: 2.3050
Epoch [9/100], Train Loss: 2.3039, Val Loss: 2.3046
Epoch [10/100], Train Loss: 2.3039, Val Loss: 2.3032
Epoch [11/100], Train Loss: 2.3040, Val Loss: 2.3042
Epoch [12/100], Train Loss: 2.3040, Val Loss: 2.3040
Epoch [13/100], Train Loss: 2.3040, Val Loss: 2.3038
Epoch [14/100], Train Loss: 2.3040, Val Loss: 2.3071
Epoch [15/100], Train Loss: 2.3040, Val Loss: 2.3052
Epoch [16/100], Train Loss: 2.3041, Val Loss: 2.3028
Epoch [17/100], Train Loss: 2.3040, Val Loss: 2.3033
Epoch [18/100], Train Loss: 2.3040, Val Loss: 2.3049
Epoch [19/100], T

In [None]:
import torch

# Draw a random 4-D tensor of shape (2, 3, 4, 5)
inputs = torch.randn(2, 3, 4, 5)
print(f"Original shape: {inputs.shape}")

# For a 4-D tensor the slice shape[4:] is empty, so nothing is unpacked and
# the call below is equivalent to inputs.view(-1): a flat tensor of
# 2 * 3 * 4 * 5 = 120 elements.
# NOTE(review): if the goal was to merge only the two leading dimensions,
# the slice would need to be shape[2:] - confirm the intent.
reshaped_inputs = inputs.view(-1, *inputs.size()[4:])
print(f"Reshaped shape: {reshaped_inputs.shape}")

# Displayed as the cell result: the trailing two dimensions as a plain list
list(inputs.shape[2:])

Original shape: torch.Size([2, 3, 4, 5])
Reshaped shape: torch.Size([120])


[4, 5]