In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Set the directory where your images are stored
image_dir = '/content/drive/My Drive/cavallo'


Mounted at /content/drive


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, RandomCrop, RandomHorizontalFlip, ToTensor, Normalize
from PIL import Image
import os
import random

class ImageTilesDataset(torch.utils.data.Dataset):
    def __init__(self, directory, grid_size=(3, 3), tile_size=100):
        self.directory = directory
        self.grid_size = grid_size
        self.tile_size = tile_size
        self.data = []
        self.labels = []
        self.load_images()

    def load_images(self):
        file_count = 0
        for filename in os.listdir(self.directory):
            if filename.lower().endswith(('.png', '.jpeg', '.jpg')):
                file_count += 1
                image_path = os.path.join(self.directory, filename)
                image = Image.open(image_path).convert('RGB')
                self.jumble_image(image)
        if file_count == 0:
            print("No images loaded.")
        else:
            print(f"Loaded {file_count} images.")

    def jumble_image(self, image):
        image = Resize((self.grid_size[0] * self.tile_size, self.grid_size[1] * self.tile_size))(image)
        tiles = [image.crop((j * self.tile_size, i * self.tile_size, (j + 1) * self.tile_size, (i + 1) * self.tile_size))
                 for i in range(self.grid_size[0]) for j in range(self.grid_size[1])]
        indices = list(range(len(tiles)))
        random.shuffle(indices)
        tiles = [tiles[i] for i in indices]
        self.data.extend(tiles)
        self.labels.extend(indices)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        transform = Compose([
            Resize(256),  # Slightly larger resize
            RandomCrop(224),  # Random crop to the final size
            RandomHorizontalFlip(),  # Horizontal flip
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        tile = transform(self.data[idx])
        label = self.labels[idx]
        return tile, label

# Include your updated VisionTransformer and its component classes here.

class PatchEmbedding(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=100):
        super().__init__()
        self.projection = nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.projection(x)  # (B, C, H/P, W/P)
        x = x.flatten(2)         # (B, C, N) where N is number of patches
        x = x.transpose(1, 2)    # (B, N, C)
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size=768, num_heads=8, dropout_rate=0.1):
        super().__init__()
        self.attention = nn.MultiheadAttention(emb_size, num_heads, dropout=dropout_rate)

    def forward(self, query, key, value):
        return self.attention(query, key, value)[0]

class TransformerEncoderLayer(nn.Module):
    def __init__(self, emb_size=768, num_heads=8, forward_expansion=4, dropout_rate=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(emb_size)
        self.attention = MultiHeadAttention(emb_size, num_heads, dropout_rate)
        self.norm2 = nn.LayerNorm(emb_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(emb_size, forward_expansion * emb_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * emb_size, emb_size),
            nn.Dropout(dropout_rate)
        )

    def forward(self, x):
        x = self.norm1(x)
        attention = self.attention(x, x, x)
        x = attention + x
        x = self.norm2(x)
        forward = self.feed_forward(x)
        out = forward + x
        return out

class VisionTransformer(nn.Module):
    def __init__(self, patch_size=16, emb_size=768, depth=6, num_heads=8, num_classes=9, img_size=100, dropout_rate=0.1):
        super().__init__()
        self.patch_embedding = PatchEmbedding(patch_size=patch_size, emb_size=emb_size, img_size=img_size)
        self.position_embeddings = nn.Parameter(torch.zeros((img_size // patch_size) ** 2 + 1, emb_size))
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))
        self.transformer = nn.Sequential(*[TransformerEncoderLayer(emb_size, num_heads, dropout_rate=dropout_rate) for _ in range(depth)])
        self.to_cls_token = nn.Identity()
        self.fc = nn.Linear(emb_size, num_classes)

    def forward(self, x):
        x = self.patch_embedding(x)
        b, n, _ = x.shape
        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.position_embeddings[:n+1]
        x = self.transformer(x)
        x = self.to_cls_token(x[:, 0])
        return self.fc(x)

# Set up training components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
image_dir = '/content/drive/My Drive/cavallo'
dataset = ImageTilesDataset(image_dir)
loader = DataLoader(dataset, batch_size=9, shuffle=True, num_workers=4)
model = VisionTransformer(img_size=224, patch_size=32, num_classes=9, depth=6, num_heads=8).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

def train_model(model, loader, criterion, optimizer, scheduler, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for tiles, labels in loader:
            tiles, labels = tiles.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(tiles)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()  # Adjust the learning rate
        average_loss = total_loss / len(loader)
        print(f"Epoch {epoch+1}: Average Loss: {average_loss:.4f}")

train_model(model, loader, criterion, optimizer, scheduler)


Loaded 2623 images.
Epoch 1: Average Loss: 2.2626
Epoch 2: Average Loss: 2.2109
Epoch 3: Average Loss: 2.2063
Epoch 4: Average Loss: 2.1981
Epoch 5: Average Loss: 2.1979
Epoch 6: Average Loss: 2.1977
Epoch 7: Average Loss: 2.1974
Epoch 8: Average Loss: 2.1973
Epoch 9: Average Loss: 2.1974
Epoch 10: Average Loss: 2.1973


In [14]:
# def piecewise_accuracy(model, loader):
#     model.eval()
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for tiles, labels in loader:
#             tiles, labels = tiles.to(device), labels.to(device)
#             outputs = model(tiles)
#             _, predicted = torch.max(outputs, 1)
#             correct += (predicted == labels).sum().item()
#             total += labels.size(0)
#     return correct / total

def puzzle_accuracy(model, dataset):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for i in range(len(dataset)):
            tiles, labels = dataset[i]
            tiles = tiles.unsqueeze(0).to(device)  # Add batch dimension
            labels = torch.tensor(labels).unsqueeze(0).to(device)  # Add batch dimension
            outputs = model(tiles)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).item()
            total += 1
    return correct / total

# # Calculate piece-wise accuracy and puzzle accuracy
# piecewise_acc = piecewise_accuracy(model, loader)
# print(f"Piece-wise Accuracy: {piecewise_acc:.4f}")

puzzle_acc = puzzle_accuracy(model, dataset)
print(f"Puzzle Accuracy: {puzzle_acc:.4f}")


Piece-wise Accuracy: 0.1111
Puzzle Accuracy: 0.1111
