# 📊 **COMPARING AND CONTRASTING VISION MODELS** 📊

In [9]:
# REASSESS v5 (with correct architecture)

import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

# Local project directory, prepended to the module search path exactly once.
# NOTE(review): presumably this is where lab7 / lab8 (imported below) live -- confirm.
Q6_PATH = '/Users/voodoo/Documents/Artificial Intelligence/Q6'
if sys.path.count(Q6_PATH) == 0:
    sys.path.insert(0, Q6_PATH)

from lab7 import create_mappings, get_cifar100_vocabulary
from lab8 import (
    CIFAR100Filtered, filter_dataset_indices,
    create_data_splits, create_dataloaders, collect_embeddings,
    compute_alignment_metrics, print_analysis_results
)

# Prefer Apple's Metal (MPS) backend, but fall back to the CPU when it is not
# available so the cell still runs on machines without MPS support.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Custom ImageEncoder with v5 architecture (LARGE head)
class ImageEncoderLarge(nn.Module):
    """Frozen MobileNetV3-Small feature extractor plus the large (v5) projection head.

    The backbone's parameters are frozen and it is always kept in eval mode;
    only the projection head is trainable.

    Args:
        proj_dim: Output width of the final projection layer.
        device: Device the backbone and the projection head are moved to.
    """

    def __init__(self, proj_dim=32, device="mps"):
        super().__init__()
        self.device = device
        base = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.DEFAULT)
        # Keep every child of the pretrained network except the last one
        # (the classification head in torchvision's MobileNetV3).
        self.backbone = nn.Sequential(*list(base.children())[:-1]).to(device).eval()
        for p in self.backbone.parameters():
            p.requires_grad = False

        # LARGE projection head (v5): 576 -> 2048 -> 1024 -> 512 -> proj_dim
        self.projection = nn.Sequential(
            nn.Linear(576, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, proj_dim)
        ).to(device)

    def train(self, mode: bool = True):
        """Switch the projection head's mode while keeping the backbone in eval mode.

        ``nn.Module.train`` recurses into every submodule, which would put the
        frozen backbone's BatchNorm layers back into training mode and let
        their running statistics drift even though the weights are frozen.
        Re-applying ``eval()`` to the backbone prevents that.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            This module.
        """
        super().train(mode)
        self.backbone.eval()
        return self

    def forward(self, x):
        """Encode a batch of images.

        Args:
            x: Input batch passed straight to the backbone.

        Returns:
            A ``(feats, out)`` tuple: ``feats`` is the backbone output flattened
            from dim 1 onward (computed without gradient tracking) and ``out``
            is the projection head applied to ``feats``.
        """
        # The backbone is frozen, so skip building an autograd graph for it.
        with torch.no_grad():
            feats = self.backbone(x).flatten(1)
        out = self.projection(feats)
        return feats, out

# Load text embeddings: the checkpoint stores the vocabulary under 'nodes' and
# the embedding matrix under model_state_dict['center_embeddings.weight'].
checkpoint_text = torch.load('EMB32_NG10_CS2_BS64.pth', map_location='cpu')
nodes = checkpoint_text['nodes']
embeddings = checkpoint_text['model_state_dict']['center_embeddings.weight'].numpy()
word_to_idx, idx_to_word = create_mappings(nodes)

# Get CIFAR vocab and create mappings.
# class_words keeps only the CIFAR class names that have a text embedding;
# label_to_word covers every CIFAR label, label_to_emb_idx only the kept ones.
cifar_vocab = get_cifar100_vocabulary()
class_words = [w for w in cifar_vocab if w in word_to_idx]
label_to_word = dict(enumerate(cifar_vocab))
label_to_emb_idx = {i: word_to_idx[word] for i, word in enumerate(cifar_vocab) if word in word_to_idx}

# Create text embeddings tensor (one row per entry of class_words, same order).
# Rows are gathered with a single NumPy fancy-index instead of a Python list of
# arrays, which torch.tensor() converts slowly and warns about.
# NOTE(review): assumes word_to_idx values are plain integer row indices -- confirm.
training_text_emb = torch.tensor(
    embeddings[[word_to_idx[word] for word in class_words]],
    dtype=torch.float32
).to(device)
# L2-normalise each row.
training_text_emb = F.normalize(training_text_emb, p=2, dim=1)

# Build the datasets, restrict them via label_to_emb_idx, then split and wrap
# them in dataloaders.
batch_sizes = {'train': 1024, 'eval': 256}

train_full = CIFAR100Filtered(split="train", root="./data")
test_full = CIFAR100Filtered(split="val", root="./data")

# Same filter applied to both datasets, train first and test second.
all_train_idx, test_idx = (
    filter_dataset_indices(split_ds, label_to_emb_idx)
    for split_ds in (train_full, test_full)
)

# Split the filtered training indices into train / validation parts with a fixed seed.
train_idx, val_idx = create_data_splits(all_train_idx, val_ratio=0.2, seed=42)
dataloaders = create_dataloaders(train_idx, val_idx, test_idx, batch_sizes)

# Load and evaluate v5 with LARGE architecture
checkpoint_v5 = torch.load('new_config_clip_vision_model_v5.pth', map_location=device)

print("=== REASSESSING v5 ===")
print(f"Loaded from epoch: {checkpoint_v5['epoch']}")
print(f"Val Similarity: {checkpoint_v5['val_similarity']:.4f}")

vision_model_v5 = ImageEncoderLarge(proj_dim=32, device=device)
# strict=False tolerates key mismatches, so surface them explicitly: a missing
# key means that layer keeps its random initialisation and the metrics below
# would not reflect the trained checkpoint.
load_result_v5 = vision_model_v5.load_state_dict(checkpoint_v5['model_state_dict'], strict=False)
if load_result_v5.missing_keys:
    print(f"WARNING: keys missing from checkpoint (left at initial values): {load_result_v5.missing_keys}")
if load_result_v5.unexpected_keys:
    print(f"WARNING: checkpoint keys not used by the model: {load_result_v5.unexpected_keys}")
vision_model_v5.eval()

# Run the model over the test split and gather its embeddings with their labels.
visual_emb_v5, all_labels_v5 = collect_embeddings(vision_model_v5, dataloaders['test'], device)

# Compare the image embeddings with the normalised text embeddings.
class_stats_v5, i2t_recalls_v5, t2i_recalls_v5, sim_matrix_v5 = compute_alignment_metrics(
    visual_emb_v5, all_labels_v5, training_text_emb.cpu().numpy(), class_words, label_to_word
)

print_analysis_results(class_stats_v5, i2t_recalls_v5, t2i_recalls_v5, len(all_labels_v5), len(class_words))


Loading CIFAR-100 vocabulary...
✓ CIFAR-100 vocabulary loaded: 100 classes
=== REASSESSING v5 ===
Loaded from epoch: 38
Val Similarity: 0.4199


Collecting embeddings: 100%|██████████| 40/40 [00:19<00:00,  2.01it/s]


📊 Per-Class Similarity Analysis:
----------------------------------------------------------------------

Top 10 Best Aligned Classes:
 1. bicycle         | Mean: 0.6283 ± 0.1384
 2. fox             | Mean: 0.6136 ± 0.0656
 3. rabbit          | Mean: 0.6095 ± 0.0624
 4. wolf            | Mean: 0.6065 ± 0.0596
 5. lion            | Mean: 0.5972 ± 0.0834
 6. leopard         | Mean: 0.5854 ± 0.0847
 7. seal            | Mean: 0.5836 ± 0.0521
 8. tiger           | Mean: 0.5776 ± 0.0974
 9. snake           | Mean: 0.5740 ± 0.0880
10. spider          | Mean: 0.5733 ± 0.0745

Bottom 10 Worst Aligned Classes:
 1. can             | Mean: 0.3315 ± 0.1440
 2. mountain        | Mean: 0.3262 ± 0.1898
 3. tank            | Mean: 0.3237 ± 0.1439
 4. skyscraper      | Mean: 0.3233 ± 0.1746
 5. baby            | Mean: 0.3144 ± 0.1366
 6. palm_tree       | Mean: 0.3065 ± 0.1884
 7. willow_tree     | Mean: 0.3020 ± 0.1601
 8. lamp            | Mean: 0.2896 ± 0.1973
 9. ray           


  ret = a @ b
  ret = a @ b
  ret = a @ b
