In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
import timm  # For advanced vision transformers
from tqdm.auto import tqdm

In [3]:
# Seed PyTorch's global RNG first so the embedding-head initialisation and the
# DataLoader shuffling order are repeatable across "Restart & Run All".
# (GPU kernels may still introduce small run-to-run numeric differences.)
seed = 42
torch.manual_seed(seed)

# Training hyperparameters.
num_epochs = 15   # full passes over the training set
batch_size = 16   # images per DataLoader batch
size = 224        # side length (pixels) of the square input produced by the Resize transform

In [4]:
# Run on CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [5]:
# =============================
# 1. Define the Prototypical Network
# =============================
class PrototypicalNetwork(nn.Module):
    """Embedding network for prototype-based (metric-learning) classification.

    A transformer backbone produces a feature vector per image, and a linear
    head projects it to ``embedding_dim`` dimensions. The output is
    L2-normalised so every embedding lies on the unit sphere.
    """

    def __init__(self, embedding_dim=128, backbone_name='deit_small_patch16_224',
                 pretrained=True, num_trainable_blocks=2, backbone=None):
        """
        Build the backbone, freeze most of it, and attach the embedding head.

        Parameters:
          - embedding_dim: Size of the output embedding.
          - backbone_name: timm model name used when ``backbone`` is None.
            The default, 'deit_small_patch16_224', expects 224x224 inputs,
            which matches the notebook's Resize((size, size)) with size=224.
          - pretrained: Whether timm should load pretrained weights.
          - num_trainable_blocks: How many of the *last* transformer blocks stay
            trainable; every other backbone parameter is frozen. 0 freezes the
            whole backbone (only the embedding head is trained).
          - backbone: Optional prebuilt module to use instead of calling timm.
            It must expose ``embed_dim`` and ``forward_features``; if it has a
            ``blocks`` attribute, that is used for selective unfreezing.

        Raises:
          - ValueError: if ``num_trainable_blocks`` is negative.
        """
        super().__init__()

        if num_trainable_blocks < 0:
            raise ValueError("num_trainable_blocks must be >= 0")

        if backbone is None:
            backbone = timm.create_model(backbone_name, pretrained=pretrained)
        self.backbone = backbone

        # Freeze all parameters in the backbone.
        for param in self.backbone.parameters():
            param.requires_grad = False

        # Unfreeze only the last `num_trainable_blocks` transformer blocks.
        blocks = getattr(self.backbone, 'blocks', None)
        if blocks is None:
            print("Warning: Could not find transformer blocks in backbone.")
        elif num_trainable_blocks > 0:
            # Guarded above because blocks[-0:] would select *every* block.
            for block in blocks[-num_trainable_blocks:]:
                for param in block.parameters():
                    param.requires_grad = True

        # Width of the backbone's feature vector, as reported by the backbone itself.
        backbone_embed_dim = self.backbone.embed_dim

        # Linear head mapping backbone features to the requested embedding size.
        self.embedding_layer = nn.Linear(backbone_embed_dim, embedding_dim)

    def forward(self, x):
        """
        Compute unit-length embeddings for a batch of images.

        - x: input tensor of shape [B, C, H, W]; H and W must be a resolution
          the backbone accepts (224x224 for the default DeiT-Small).
        - If the backbone returns per-token features of shape [B, T, D], the
          first token (the class token) is selected, giving [B, D].
          Features that are already [B, D] are used as they are.
        - Returns a tensor of shape [B, embedding_dim] with L2 norm 1 per row.
        """
        features = self.backbone.forward_features(x)
        if features.ndim == 3:
            # Keep only the first token of the sequence.
            features = features[:, 0, :]
        embedding = self.embedding_layer(features)
        # Unit-normalise so distances between embeddings are bounded and comparable.
        return F.normalize(embedding, p=2, dim=1)

In [6]:
# =============================
# 2. Define the Prototypical Loss Function
# =============================
def prototypical_loss(embeddings, labels):
    """
    Prototype-based cross-entropy loss and accuracy for a single batch.

    Parameters:
      - embeddings: Tensor of shape [B, embedding_dim] (one embedding per sample).
      - labels: Tensor with B elements holding the class label of each sample
        (any shape; it is flattened).

    Steps:
      1. Find the distinct classes present in the batch and remap every label to
         a batch-local index in [0, n_unique).
      2. Build each class prototype as the mean embedding of that class's samples.
      3. Compute the Euclidean distance from every sample to every prototype.
      4. Use negative distances as logits and apply cross-entropy against the
         remapped targets.

    Returns:
      - loss: scalar tensor (mean cross-entropy over the batch).
      - acc: scalar tensor, fraction of samples whose nearest prototype is
        their own class.

    Raises:
      - ValueError: if the batch is empty or the two inputs disagree on B.

    Note: prototypes are built from the very samples being scored. A class that
    appears only once in the batch has a prototype equal to its own embedding,
    so that sample is (almost) always counted as correct.
    """
    labels = labels.reshape(-1)
    if labels.numel() == 0:
        raise ValueError("prototypical_loss requires a non-empty batch")
    if embeddings.shape[0] != labels.shape[0]:
        raise ValueError(
            f"embeddings has {embeddings.shape[0]} rows but labels has {labels.shape[0]} entries"
        )

    # `unique_labels` is sorted; `target_idxs[i]` is the position of labels[i] in it,
    # which is exactly the batch-local class index used as the classification target.
    unique_labels, target_idxs = torch.unique(labels, return_inverse=True)

    # One prototype per class present in the batch: [n_unique, embedding_dim].
    prototypes = torch.stack([
        embeddings[target_idxs == class_idx].mean(dim=0)
        for class_idx in range(unique_labels.numel())
    ])

    # Pairwise Euclidean distances via broadcasting:
    # [B, 1, D] - [1, n_unique, D] -> [B, n_unique, D] -> sum over D -> [B, n_unique].
    # The 1e-8 keeps sqrt differentiable when a sample coincides with a prototype.
    dists = torch.sqrt(((embeddings.unsqueeze(1) - prototypes.unsqueeze(0)) ** 2).sum(dim=-1) + 1e-8)

    # Smaller distance => larger logit.
    logits = -dists

    loss = F.cross_entropy(logits, target_idxs)
    preds = torch.argmax(logits, dim=1)
    acc = (preds == target_idxs).float().mean()
    return loss, acc

In [7]:
# Preprocessing shared by training, evaluation and single-image inference:
# resize to size x size, convert to a float tensor, then normalise each channel
# with the standard ImageNet mean/std.
transform = transforms.Compose([
    transforms.Resize((size, size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Dataset root. Set the TOURIST_SITES_ROOT environment variable to run the
# notebook on another machine; the default keeps the original location.
data_root = os.environ.get("TOURIST_SITES_ROOT", "G:/github_repos/tourist_sites_2")
train_dir = f"{data_root}/train"
test_dir  = f"{data_root}/test"

# ImageFolder assigns one integer label per sub-directory of the given root.
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
test_dataset  = datasets.ImageFolder(test_dir, transform=transform)

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [8]:
# =============================
# 4. Instantiate Model and Optimizer
# =============================
# Build the embedding network and move it to the selected device.
model = PrototypicalNetwork(embedding_dim=128)
model = model.to(device)

# Hand Adam only the parameters left trainable by the network's constructor;
# frozen backbone weights are excluded from optimisation.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=3e-4)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# =============================
# 5. Training and Testing Loop
# =============================

def _run_epoch(model, loader, device, optimizer=None):
    """Run one pass over `loader` and return (mean loss, mean accuracy) per sample.

    When `optimizer` is given the model is put in train mode and updated after
    every batch; otherwise the model is put in eval mode and gradients are
    disabled. Metrics are weighted by batch size so a smaller final batch does
    not skew the epoch average.

    Raises:
      - ValueError: if the loader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    acc_sum = 0.0
    sample_count = 0
    with torch.set_grad_enabled(is_training):
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            if is_training:
                optimizer.zero_grad()
            embeddings = model(images)
            loss, acc = prototypical_loss(embeddings, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            # loss/acc are per-batch means; scale back to sums before accumulating.
            n_samples = labels.size(0)
            loss_sum += loss.item() * n_samples
            acc_sum += acc.item() * n_samples
            sample_count += n_samples

    if sample_count == 0:
        raise ValueError("DataLoader produced no samples")
    return loss_sum / sample_count, acc_sum / sample_count


# NOTE(review): both metrics come from prototypical_loss, which builds prototypes
# from the batch being scored. test_loader is unshuffled, so if ImageFolder yields
# samples grouped by class, most test batches contain very few classes and
# "Test Acc" is close to trivially perfect — confirm before reporting it as
# classification accuracy.
for epoch in tqdm(range(num_epochs)):
    avg_train_loss, avg_train_acc = _run_epoch(model, train_loader, device, optimizer)
    avg_test_loss, avg_test_acc = _run_epoch(model, test_loader, device)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f} | "
          f"Test Loss: {avg_test_loss:.4f}, Test Acc: {avg_test_acc:.4f}")


  7%|█████▌                                                                             | 1/15 [00:35<08:16, 35.44s/it]

Epoch 1/15 | Train Loss: 1.5141, Train Acc: 0.9975 | Test Loss: 0.8984, Test Acc: 1.0000


 13%|███████████                                                                        | 2/15 [01:06<07:07, 32.91s/it]

Epoch 2/15 | Train Loss: 1.4802, Train Acc: 0.9925 | Test Loss: 0.8256, Test Acc: 1.0000


 20%|████████████████▌                                                                  | 3/15 [01:38<06:27, 32.28s/it]

Epoch 3/15 | Train Loss: 1.4400, Train Acc: 0.9962 | Test Loss: 0.7714, Test Acc: 0.9911


 27%|██████████████████████▏                                                            | 4/15 [02:09<05:53, 32.12s/it]

Epoch 4/15 | Train Loss: 1.3892, Train Acc: 0.9988 | Test Loss: 0.7403, Test Acc: 0.9821


 33%|███████████████████████████▋                                                       | 5/15 [02:47<05:40, 34.03s/it]

Epoch 5/15 | Train Loss: 1.3968, Train Acc: 0.9962 | Test Loss: 0.7337, Test Acc: 0.9821


 40%|█████████████████████████████████▏                                                 | 6/15 [03:21<05:07, 34.12s/it]

Epoch 6/15 | Train Loss: 1.4022, Train Acc: 0.9975 | Test Loss: 0.7159, Test Acc: 0.9911


 47%|██████████████████████████████████████▋                                            | 7/15 [03:52<04:24, 33.11s/it]

Epoch 7/15 | Train Loss: 1.3594, Train Acc: 0.9975 | Test Loss: 0.6978, Test Acc: 1.0000


 53%|████████████████████████████████████████████▎                                      | 8/15 [04:24<03:49, 32.75s/it]

Epoch 8/15 | Train Loss: 1.3536, Train Acc: 0.9988 | Test Loss: 0.6901, Test Acc: 0.9911


 60%|█████████████████████████████████████████████████▊                                 | 9/15 [04:58<03:19, 33.22s/it]

Epoch 9/15 | Train Loss: 1.3501, Train Acc: 1.0000 | Test Loss: 0.6835, Test Acc: 1.0000


 67%|██████████████████████████████████████████████████████▋                           | 10/15 [05:30<02:43, 32.78s/it]

Epoch 10/15 | Train Loss: 1.3231, Train Acc: 1.0000 | Test Loss: 0.6704, Test Acc: 1.0000


 73%|████████████████████████████████████████████████████████████▏                     | 11/15 [06:02<02:09, 32.40s/it]

Epoch 11/15 | Train Loss: 1.3646, Train Acc: 1.0000 | Test Loss: 0.6620, Test Acc: 0.9911


 80%|█████████████████████████████████████████████████████████████████▌                | 12/15 [06:33<01:36, 32.18s/it]

Epoch 12/15 | Train Loss: 1.3490, Train Acc: 0.9988 | Test Loss: 0.6593, Test Acc: 1.0000


 87%|███████████████████████████████████████████████████████████████████████           | 13/15 [07:05<01:03, 31.97s/it]

Epoch 13/15 | Train Loss: 1.3134, Train Acc: 0.9988 | Test Loss: 0.6578, Test Acc: 1.0000


 93%|████████████████████████████████████████████████████████████████████████████▌     | 14/15 [07:36<00:31, 31.78s/it]

Epoch 14/15 | Train Loss: 1.3100, Train Acc: 1.0000 | Test Loss: 0.6539, Test Acc: 0.9911


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [08:07<00:00, 32.51s/it]

Epoch 15/15 | Train Loss: 1.3246, Train Acc: 1.0000 | Test Loss: 0.6480, Test Acc: 1.0000





In [10]:
# =============================
# 6. Compute and Save Final Class Prototypes
# =============================
# After training, compute a prototype (mean embedding) for each class using the entire training set.
# Switch to inference mode before extracting embeddings.
model.eval()

# One slot per class known to the training ImageFolder.
num_classes = len(train_dataset.classes)
# Per-class buckets that will collect every training embedding of that class.
all_embeddings = {class_idx: [] for class_idx in range(num_classes)}
# Filled later with {class_index: prototype_tensor}.
class_prototypes = dict()

In [11]:
# Start from empty buckets every time so re-running this cell never counts a
# training sample twice.
all_embeddings = {i: [] for i in range(num_classes)}

model.eval()
with torch.no_grad():
    for images, labels in train_loader:
        images = images.to(device)
        # One device-to-host copy per batch instead of one per sample.
        embeddings = model(images).cpu()  # Shape: [B, embedding_dim]
        for emb, label in zip(embeddings, labels.tolist()):
            all_embeddings[label].append(emb)

# Compute the mean embedding (prototype) for each class.
for cls, emb_list in all_embeddings.items():
    if not emb_list:
        raise ValueError(f"No training samples found for class index {cls}")
    class_prototypes[cls] = torch.stack(emb_list).mean(dim=0)

In [12]:
# Persist both artifacts with torch.save: the whole model object first, then the
# {class_index: prototype_tensor} dictionary.
for artifact, filename in (
    (model, "full_prototypical_model.pth"),
    (class_prototypes, "class_prototypes.pth"),
):
    torch.save(artifact, filename)

print("Full model and prototypes saved!")

Full model and prototypes saved!


In [None]:
# 28 tourist-site names.
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook. Presumably these are display names for the dataset classes — the
# order has not been checked against train_dataset.classes; confirm before
# using it to translate predicted indices.
ls = [
     'Abu Simbel Temple',
     'Bibliotheca Alexandrina',
     'Nefertari Temple',
     'Saint Catherine Monastery',
     'Citadel of Saladin',
     'Monastery of St. Simeon',
     'AlAzhar Mosque',
     'Fortress of Shali in Siwa',
     'Greek_Orthodox_Cemetery in Alexandria',
     'Hanging Church',
     'khan el khalili',
     'Luxor Temple',
     'Baron_empain',
     'New Alamein City',
     'Philae Temple',
     'Pyramid of Djoser',
     'Salt lake at Siwa',
     'Wadi Al-Hitan',
     'White Desert',
     'Cairo Opera House',
     'Tahrir Square',
     'Cairo tower',
     'Citadel of Qaitbay',
     'Eg musuem',
     'Great Pyramids of Giza',
     'Hatshepsut temple',
     'Meidum pyramid',
     'Royal Montaza Palace'
]

In [14]:
from PIL import Image
import torch

def predict_single_image(image_path, model, prototypes, transform, device, class_names=None):
    """
    Predict the class of a single image by nearest class prototype.

    Parameters:
      - image_path: Path to the image file.
      - model: Trained model that maps a [1, C, H, W] tensor to a [1, D] embedding.
        It is switched to eval mode and left that way.
      - prototypes: Dictionary {class_index: prototype_tensor of shape [D]}.
      - transform: Transform pipeline used during training.
      - device: Device on which the model lives ('cuda' or 'cpu').
      - class_names: (Optional) Sequence mapping class indices to names; only
        used for the printed message.

    Returns:
      - pred_class: Key of the prototype closest to the image embedding.
      - distances: Dictionary {class_index: L2 distance as a Python float}.

    Raises:
      - ValueError: if `prototypes` is empty.
    """
    if not prototypes:
        raise ValueError("prototypes is empty; at least one class prototype is required")

    # Decode inside a context manager so the image file is closed right away;
    # convert() returns an independent in-memory RGB copy.
    with Image.open(image_path) as raw_img:
        img = raw_img.convert('RGB')
    img_tensor = transform(img).unsqueeze(0).to(device)  # Shape: [1, C, H, W]

    # Extract the embedding without tracking gradients.
    model.eval()
    with torch.no_grad():
        embedding = model(img_tensor)  # Shape: [1, embedding_dim]

    # Stack all prototypes into one [K, D] matrix and measure every distance in a
    # single batched norm (one host transfer instead of one per class).
    class_ids = list(prototypes.keys())
    proto_matrix = torch.stack([prototypes[cls].to(device) for cls in class_ids])
    dist_values = torch.norm(embedding - proto_matrix, dim=1).tolist()
    distances = dict(zip(class_ids, dist_values))

    # Predict the class with the smallest distance.
    pred_class = min(distances, key=distances.get)

    if class_names:
        print(f"Predicted class: {class_names[pred_class]} with distance {distances[pred_class]:.4f}")
    else:
        print(f"Predicted class index: {pred_class} with distance {distances[pred_class]:.4f}")

    return pred_class, distances

# Example input for predict_single_image.
# NOTE(review): this is a machine-specific absolute path; point it at a local image before running.
image_path = "G:/github_repos/tourist_sites_2/22.jpg"
# The example call is currently commented out, so running this cell only defines
# the function and the path. Uncomment the next line to classify the image.
#pred_class, distances = predict_single_image(image_path, model, class_prototypes, transform, device, class_names=train_dataset.classes)

Predicted class: 2-Baron_empain with distance 0.6200


In [15]:
# Save only the model's parameters (its state_dict) to "model_state.pth".

torch.save(model.state_dict(), "model_state.pth")