# CLIP with Flowers!?!?!??!?

In [4]:
import torch
import torchvision
#!pip install openai-clip
import clip
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR

## Dataset Functions

We define utility functions for:
- **`get_data()`**: Load Flowers102 from torchvision
- **`base_novel_categories()`**: Split 102 classes into base (0-50) and novel (51-101)
- **`split_data()`**: Filter images for base/novel in each split

This simulates the real scenario: we have 51 seen classes during training (base) and 51 new ones (novel).


In [5]:
def get_data(data_dir="./data", transform=None):
    """Return the Flowers102 datasets as a ``(train, val, test)`` tuple.

    Args:
        data_dir: Root directory handed to torchvision; files are downloaded
            there (``download=True``).
        transform: Optional image transform passed to every split.

    Returns:
        The three ``torchvision.datasets.Flowers102`` objects, in the order
        train, val, test.
    """
    def load_split(split_name):
        # All three splits share the same root, download flag and transform.
        return torchvision.datasets.Flowers102(
            root=data_dir,
            split=split_name,
            download=True,
            transform=transform,
        )

    return load_split("train"), load_split("val"), load_split("test")

def base_novel_categories(dataset):
    """Split the class ids present in ``dataset`` into base and novel halves.

    The ids are taken from the labels themselves (sorted ascending) rather
    than assumed to be ``0..num_classes-1``, so the returned ids always match
    the values that ``dataset._labels`` actually contains. For labels that are
    already ``0..num_classes-1`` the result is the same as slicing
    ``range(num_classes)``.

    Args:
        dataset: Object exposing ``_labels``, an iterable with one class id
            per sample.

    Returns:
        ``(base_classes, novel_classes)``: two lists of class ids. The first
        holds the lower ``num_classes // 2`` ids, the second the remainder, so
        with an odd number of classes the novel list is one longer.
    """
    class_ids = sorted(set(dataset._labels))
    half = len(class_ids) // 2
    return class_ids[:half], class_ids[half:]

def split_data(dataset, base_classes):
    """Partition ``dataset`` into a base-class subset and a novel-class subset.

    Args:
        dataset: Dataset exposing ``_labels`` (one class id per sample, in
            sample order).
        base_classes: Iterable of class ids that count as "base".

    Returns:
        ``(base_dataset, novel_dataset)``: two ``torch.utils.data.Subset``
        views of ``dataset``. Every sample whose label is not in
        ``base_classes`` goes to the novel subset.
    """
    base_lookup = set(base_classes)
    membership = [label in base_lookup for label in dataset._labels]

    base_indices = [idx for idx, is_base in enumerate(membership) if is_base]
    novel_indices = [idx for idx, is_base in enumerate(membership) if not is_base]

    return (
        torch.utils.data.Subset(dataset, base_indices),
        torch.utils.data.Subset(dataset, novel_indices),
    )

## Class Names and Dataset Loading

We load the names of 102 flower classes from Flowers102.

This is **critical** for CLIP:
- Creates prompts like "a photo of a **rose**, a type of flower"
- Each prompt is encoded by CLIP's text encoder
- Image features are compared against these text templates


In [6]:
# Load the splits without a transform; only the test labels are needed here,
# to derive the base/novel class-id split.
_, _, tmp_test = get_data()
base_classes, novel_classes = base_novel_categories(tmp_test)

# Display name for every class; the rest of this file indexes it with dataset
# label ids (CLASS_NAMES[c]) to build the text prompts.
# NOTE(review): the entry "pink-yellow dahlia?" contains a literal "?" that
# ends up verbatim in the prompt text; confirm whether that is intended.
CLASS_NAMES = ["pink primrose", "hard-leaved pocket orchid", "canterbury bells", "sweet pea", "english marigold", "tiger lily", "moon orchid", "bird of paradise", "monkshood", "globe thistle", "snapdragon", "colt's foot", "king protea", "spear thistle", "yellow iris", "globe-flower", "purple coneflower", "peruvian lily", "balloon flower", "giant white arum lily", "fire lily", "pincushion flower", "fritillary", "red ginger", "grape hyacinth", "corn poppy", "prince of wales feathers", "stemless gentian", "artichoke", "sweet william", "carnation", "garden phlox", "love in the mist", "mexican aster", "alpine sea holly", "ruby-lipped cattleya", "cape flower", "great masterwort", "siam tulip", "lenten rose", "barbeton daisy", "daffodil", "sword lily", "poinsettia", "bolero deep blue", "wallflower", "marigold", "buttercup", "oxeye daisy", "common dandelion", "petunia", "wild pansy", "primula", "sunflower", "pelargonium", "bishop of llandaff", "gaura", "geranium", "orange dahlia", "pink-yellow dahlia?", "cautleya spicata", "japanese anemone", "black-eyed susan", "silverbush", "californian poppy", "osteospermum", "spring crocus", "bearded iris", "windflower", "tree poppy", "gazania", "azalea", "water lily", "rose", "thorn apple", "morning glory", "passion flower", "lotus", "toad lily", "anthurium", "frangipani", "clematis", "hibiscus", "columbine", "desert-rose", "tree mallow", "magnolia", "cyclamen", "watercress", "canna lily", "hippeastrum", "bee balm", "ball moss", "foxglove", "bougainvillea", "camellia", "mallow", "mexican petunia", "bromelia", "blanket flower", "trumpet creeper", "blackberry lily"]

# Uncomment to see class names
# print("Base Class Names:", [(i, CLASS_NAMES[i]) for i in base_classes])
# print("Novel Class Names:", [(i, CLASS_NAMES[i]) for i in novel_classes])

In [7]:
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the network together with the image preprocessing
# that is later used as the dataset transform.
model, preprocess = clip.load("ViT-B/16", device=device)

print(f"Device: {device}")
print(f"Model: ViT-B/16")

Device: cpu
Model: ViT-B/16


## Load Flowers102 and Split Base/Novel

We load the 3 splits (train, val, test) and divide into base/novel.

**Statistics:**
- Train Base: 10 images × 51 classes = 510 images
- Val Base: 10 images × 51 classes = 510 images
- Test Base: 2,473 images across the 51 base classes (the test split is not limited to 10 images per class)
- Test Novel: 3,676 images across the 51 novel classes

**Note:** Train and val have ~10 images per class (few-shot setting).


In [8]:
# Load the three splits again, this time with CLIP's preprocessing applied to every image.
train_set, val_set, test_set = get_data(transform=preprocess)

# Derive the base/novel class ids from the training labels.
base_classes, novel_classes = base_novel_categories(train_set)

# Train and val only keep their base-class part; the test split keeps both halves.
train_base = split_data(train_set, base_classes)[0]
val_base = split_data(val_set, base_classes)[0]
test_base, test_novel = split_data(test_set, base_classes)

# Report the size of every subset that is used later on.
print(f"Train Base: {len(train_base)} samples")
print(f"Val Base: {len(val_base)} samples")
print(f"Test Base: {len(test_base)} samples")
print(f"Test Novel: {len(test_novel)} samples")

Train Base: 510 samples
Val Base: 510 samples
Test Base: 2473 samples
Test Novel: 3676 samples


## Zero-Shot CLIP Evaluation

We evaluate the original CLIP **without any training**:

1. For each class, create a fixed prompt: "a photo of a {class_name}, a type of flower"

2. Encode the prompt with text encoder → text_features (512-dim)

3. Encode the image with vision encoder → image_features (512-dim)

4. Cosine similarity between image and text features → logits

5. Prediction = class with highest similarity

**Expected results:**
- Base Accuracy: ~71% (CLIP is not specialized on this dataset)
- Novel Accuracy: ~78% (CLIP generalizes better on new classes)

In [None]:
@torch.no_grad()
def eval(model, dataset, categories, batch_size, device, label=""):
    """Zero-shot accuracy of CLIP on ``dataset`` using one fixed prompt per class.

    NOTE: this function shadows Python's built-in ``eval`` for the rest of
    the file; the name is kept because later cells call it.

    Args:
        model: CLIP model providing ``encode_text`` and ``encode_image``.
        dataset: Dataset yielding ``(image, label)`` pairs.
        categories: Class ids to discriminate between. Every label in
            ``dataset`` must be one of them; prediction ``i`` refers to
            ``categories[i]``.
        batch_size: Number of images per forward pass.
        device: Device the inputs are moved to.
        label: Caption shown on the progress bar.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when ``dataset``
        is empty.

    Raises:
        KeyError: If a label is not listed in ``categories``.
    """
    model.eval()

    # Dataset labels are original class ids; predictions are positions in `categories`.
    contig_cat2idx = {cat: idx for idx, cat in enumerate(categories)}

    text_inputs = clip.tokenize(
        [f"a photo of a {CLASS_NAMES[c]}, a type of flower." for c in categories]
    ).to(device)

    # One unit-norm text embedding per class, computed once and reused for every batch.
    text_features = model.encode_text(text_inputs)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    correct_predictions = 0
    for image, target in tqdm(dataloader, desc=label):
        # Build the remapped targets directly as integers on the target device
        # (no float round-trip, one host transfer per batch).
        target = torch.tensor(
            [contig_cat2idx[t] for t in target.tolist()],
            dtype=torch.long,
            device=device,
        )
        image = image.to(device)

        image_features = model.encode_image(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # With unit-norm features the dot product is the cosine similarity.
        predicted_class = (image_features @ text_features.T).argmax(dim=-1)
        correct_predictions += (predicted_class == target).sum().item()

    n_samples = len(dataset)
    if n_samples == 0:
        # Nothing to score; avoid a ZeroDivisionError.
        return 0.0
    return correct_predictions / n_samples

# Compute zero-shot baseline
# These two accuracies are the reference that the final comparison cell
# reports the CoCoOp results against.
print("\n" + "="*60)
print("üîµ ZERO-SHOT BASELINE")
print("="*60)

# `eval` here is the zero-shot evaluation function defined in this file,
# not Python's built-in. Each call scores one half of the test split
# against the prompts of its own class ids.
base_accuracy = eval(model=model, dataset=test_base, categories=base_classes, batch_size=64, device=device, label="Zero-shot evaluation on Base Classes")
novel_accuracy = eval(model=model, dataset=test_novel, categories=novel_classes, batch_size=64, device=device, label="Zero-shot evaluation on Novel Classes")

# Accuracies are fractions in [0, 1]; print them as percentages.
print(f"\nüîç Base classes accuracy: {base_accuracy*100:.2f}%")
print(f"üîç Novel classes accuracy: {novel_accuracy*100:.2f}%")


üîµ ZERO-SHOT BASELINE


Zero-shot evaluation on Base Classes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [33:30<00:00, 51.56s/it]
Zero-shot evaluation on Novel Classes:  17%|‚ñà‚ñã        | 10/58 [08:31<40:49, 51.04s/it]

## Harmonic Mean (HM)

Standard metric for few-shot adaptation papers.

Formula: HM = 2 / (1/base_acc + 1/novel_acc)

**Why HM instead of arithmetic mean?**
- HM heavily penalizes outliers
- If base=90% and novel=50%: arithmetic mean=70%, HM=64.3%
- Forces the model to balance both accuracies

**Goal:** HM > 75% (improvement over zero-shot ~74.6%)


In [None]:
def harmonic_mean(base_accuracy, novel_accuracy):
    """Return the harmonic mean of the base and novel accuracies.

    Args:
        base_accuracy: Accuracy on the base classes.
        novel_accuracy: Accuracy on the novel classes.

    Returns:
        ``2 / (1/base_accuracy + 1/novel_accuracy)``. When either accuracy is
        zero (or negative) the result is ``0.0``, which is the limit of the
        harmonic mean as one term goes to zero, instead of raising
        ``ZeroDivisionError``.
    """
    if base_accuracy <= 0 or novel_accuracy <= 0:
        return 0.0
    return 2 / (1 / base_accuracy + 1 / novel_accuracy)

# Harmonic mean of the two zero-shot accuracies; reused in the final comparison.
hm_zeroshot = harmonic_mean(
    base_accuracy=base_accuracy,
    novel_accuracy=novel_accuracy,
)
print(f"üîç Harmonic Mean: {hm_zeroshot*100:.2f}%")


üîç Harmonic Mean: 74.60%


## MetaNetwork: Conditional Token Generator

**Problem:** Fixed prompts don't adapt to each image.

**Solution:** A small neural network that transforms image features into a conditional token.

**Parameters:** ~263K (a 512→256→512 MLP with biases: 262,912 weights; negligible vs. fine-tuning CLIP itself)

**Effect:** Each image gets a different prompt → instance-level adaptation


In [None]:
"""
MetaNetwork is a small neural network (a 2-layer MLP) that
turns the image_features (512-dim) into a conditional token
(512-dim) used by CoCoOp.

The token is different for every image, which gives each
input its own customised prompt.
"""

class MetaNetwork(nn.Module):
    """Two-layer MLP that maps image features to a per-image conditional token.

    Input and output both have width ``ctx_dim``; a ReLU sits between the two
    linear layers.
    """

    def __init__(self, ctx_dim=512, hidden_dim=256):
        """Create the layers.

        Args:
            ctx_dim: Width of the incoming features and of the produced token.
            hidden_dim: Width of the hidden layer.
        """
        super().__init__()
        self.linear1 = nn.Linear(in_features=ctx_dim, out_features=hidden_dim)
        self.relu = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(in_features=hidden_dim, out_features=ctx_dim)

    def forward(self, image_features):
        """Compute the conditional token for a batch of image features.

        Args:
            image_features: Tensor whose last dimension is ``ctx_dim``.

        Returns:
            Tensor with the same leading shape and last dimension ``ctx_dim``,
            in the dtype of this module's weights.
        """
        # The caller may hand over features in another precision; align them
        # with the layer weights before the first matmul.
        weight_dtype = self.linear1.weight.dtype
        hidden = self.relu(self.linear1(image_features.to(weight_dtype)))
        return self.linear2(hidden)


## CoCoOpPromptLearner: Dynamic Prompts


**Components:**
1. **V1...VM:** 4 context vectors by default (`n_ctx=4`, learned via SGD)
   - Shape: (4, 512) tensor
   - Initialized randomly from N(0, 0.02²)
   - Optimized during training

2. **π(x):** Conditional token (generated per image)
   - Shape: (B, 512) from MetaNetwork output
   - Different for each image

3. **[CLASS]:** Class name embedding
   - Shape: (seq_len, 512) from CLIP's token embedding
   - Same for all images of the same class

**Forward Pass:**
- Input: image_features (B, 512)
- Output: prompts (B, num_classes, seq_len_total, 512)


In [None]:
class CoCoOpPromptLearner(nn.Module):
    """Builds image-conditioned soft prompts for every class.

    Each prompt is the concatenation, along the sequence axis, of
    ``[learned context (n_ctx)] + [conditional token (1)] + [class-name token
    embeddings]``, cut to CLIP's context length.

    Attributes set here and read by the trainer:
        n_ctx: number of learned context vectors.
        ctx: the learned context, shape ``(n_ctx, ctx_dim)``.
        meta_net: ``MetaNetwork`` producing the conditional token.
        class_token_ids: tokenized class names (buffer).
        class_token_embeddings: CLIP token embeddings of those ids (buffer).

    NOTE(review): the class embeddings come from ``clip.tokenize`` on the bare
    class names, so they presumably start with CLIP's start-of-text token,
    which then sits *after* the learned context instead of at position 0.
    Confirm that this ordering is intended.
    """

    def __init__(self, clip_model, classnames, n_ctx=4):
        """Create the learnable context and cache the class-name embeddings.

        Args:
            clip_model: Loaded CLIP model; its ``dtype``, ``ln_final``,
                ``context_length`` and ``token_embedding`` are used.
            classnames: List of class-name strings to tokenize.
            n_ctx: Number of learned context vectors.
        """
        super().__init__()
        self.n_ctx = n_ctx
        self.classnames = classnames
        self.clip_context_length = clip_model.context_length

        # The text transformer width is read off the final layer norm.
        ctx_dim = int(clip_model.ln_final.weight.shape[0])
        print(f"[CoCoOp] ctx_dim={ctx_dim}, max_len={self.clip_context_length}")

        # Learned context, drawn from N(0, 0.02^2) in CLIP's own dtype.
        initial_ctx = torch.empty(n_ctx, ctx_dim, dtype=clip_model.dtype)
        nn.init.normal_(initial_ctx, std=0.02)
        self.ctx = nn.Parameter(initial_ctx)

        self.meta_net = MetaNetwork(ctx_dim)

        # Tokenize on the device CLIP lives on and embed without tracking gradients.
        clip_device = next(clip_model.parameters()).device
        token_ids = clip.tokenize(classnames).to(clip_device)
        with torch.no_grad():
            token_embeddings = clip_model.token_embedding(token_ids)

        self.register_buffer("class_token_embeddings", token_embeddings)
        # The ids are kept so callers can locate the EOT position of each class.
        self.register_buffer("class_token_ids", token_ids)

    def forward(self, image_features):
        """Assemble one prompt per (image, class) pair.

        Args:
            image_features: Tensor of shape ``(B, ctx_dim)``.

        Returns:
            Tensor of shape ``(B, num_classes, L, ctx_dim)`` where ``L`` is
            ``n_ctx + 1 + class_len`` capped at ``clip_context_length``.
        """
        n_images = image_features.shape[0]
        n_classes = self.class_token_embeddings.shape[0]

        # Learned context, shared by every image and class: (B, N, n_ctx, D).
        shared_ctx = self.ctx[None, None, :, :].expand(n_images, n_classes, -1, -1)

        # One conditional token per image, shared across classes: (B, N, 1, D).
        per_image = self.meta_net(image_features)[:, None, None, :]
        per_image = per_image.expand(-1, n_classes, -1, -1)

        # Class-name embeddings, shared across images: (B, N, class_len, D).
        per_class = self.class_token_embeddings[None].expand(n_images, -1, -1, -1)

        # Concat: [ctx] + [pi(x)] + [class], then drop whatever exceeds CLIP's context length.
        full_prompts = torch.cat([shared_ctx, per_image, per_class], dim=2)
        return full_prompts[:, :, :self.clip_context_length, :]


## CoCoOpTrainer: Training and Evaluation

Class that manages:

**1. Initialization:**
- Create PromptLearner
- Freeze CLIP (`requires_grad=False`)
- Configure SGD optimizer for prompt learner only

**2. train_epoch():**
- Forward: Image encoder + PromptLearner + Text encoder
- **Critical step:** Encode soft prompts through text transformer
  - Add positional embeddings
  - Pass through CLIP's transformer
  - Extract the end-of-text (EOT) token of each prompt
  - Apply final layer norm + projection
- Compute loss: Cross-entropy on base classes
- Backward: Backprop only in PromptLearner
- Return: Average loss of the epoch

**3. eval():**
- Same forward procedure as training
- Without backward pass
- Compute accuracy on any dataset (base or novel)

**Important note:** We don't use `model.encode_text()` on soft prompts
because that method expects integer tokens, not embeddings.
We manually forward through the text transformer.

In [None]:
"""
CoCoOpTrainer provides:
1. train_epoch(): runs one training epoch on the base classes
2. eval(): evaluates on base or novel classes

Important: CLIP stays frozen, only the prompt learner is trained!
"""

class CoCoOpTrainer:
    """Trains and evaluates a CoCoOp prompt learner on top of a frozen CLIP model.

    Only the parameters of ``self.prompt_learner`` are optimised; every CLIP
    parameter has ``requires_grad`` switched off in ``__init__``.

    Soft prompts are embeddings rather than token ids, so they are pushed
    through CLIP's text transformer manually (``_encode_prompts``) using the
    same steps in training and evaluation.

    Note that every batch encodes ``batch_size * num_classes`` prompts, so
    memory use grows with both.
    """

    def __init__(self, clip_model, base_classnames, base_classes,
                 novel_classes, device, lr=0.002):
        """Freeze CLIP, create the prompt learner and its SGD optimizer.

        Args:
            clip_model: Loaded CLIP model.
            base_classnames: Names of the base classes, aligned with
                ``base_classes``.
            base_classes: Class ids of the base classes.
            novel_classes: Class ids of the novel classes.
            device: Device used for inputs and for the prompt learner.
            lr: SGD learning rate.
        """
        self.clip_model = clip_model
        self.base_classnames = base_classnames
        self.base_classes = base_classes
        self.novel_classes = novel_classes
        self.device = device

        # Original class id -> contiguous index in [0, num_base_classes),
        # used to turn dataset labels into cross-entropy targets.
        self.contig_cat2idx = {cat: idx for idx, cat in enumerate(self.base_classes)}

        # Freeze CLIP parameters.
        for p in clip_model.parameters():
            p.requires_grad = False

        self.prompt_learner = CoCoOpPromptLearner(
            clip_model,
            base_classnames
        ).to(device)

        # The optimizer only sees the prompt learner (context vectors + meta network).
        self.optimizer = torch.optim.SGD(
            self.prompt_learner.parameters(),
            lr=lr,
            momentum=0.9,
            weight_decay=5e-4
        )

    # ------------------------------------------------------------------
    # Shared forward-pass helpers
    # ------------------------------------------------------------------

    def _image_features(self, images):
        """Return unit-norm CLIP image features in the meta network's dtype."""
        with torch.no_grad():  # the image encoder is frozen
            feats = self.clip_model.encode_image(images)
        feats = feats.to(self.prompt_learner.meta_net.linear1.weight.dtype)
        return feats / feats.norm(dim=-1, keepdim=True)

    def _encode_prompts(self, prompts):
        """Run ``(B, N, L, D)`` soft prompts through CLIP's text tower.

        Returns the projected (not yet normalised) text features with shape
        ``(B, N, embed_dim)``.
        """
        clip_model = self.clip_model
        learner = self.prompt_learner
        B, N, L, D = prompts.shape

        x = prompts.reshape(B * N, L, D)
        x = x + clip_model.positional_embedding
        # Match the transformer's weight dtype (a no-op when CLIP runs in
        # float32, required when it runs in half precision).
        x = x.type(clip_model.dtype)

        x = x.permute(1, 0, 2)  # (L, B*N, D), the layout the transformer expects
        x = clip_model.transformer(x)
        x = x.permute(1, 0, 2)  # back to (B*N, L, D)

        # The EOT token has the highest id in each tokenized class name, so
        # argmax finds it; shift by the n_ctx context tokens plus the one
        # conditional token that were prepended. Clamp so a shifted position
        # can never index past the (possibly truncated) sequence.
        eot_per_class = learner.class_token_ids.argmax(dim=-1) + learner.n_ctx + 1
        eot_per_class = eot_per_class.clamp(max=L - 1).to(x.device)
        # Flat row i belongs to class i % N, so tile the per-class positions B times.
        eot_positions = eot_per_class.repeat(B)
        x = x[torch.arange(B * N, device=x.device), eot_positions]  # (B*N, D)

        x = clip_model.ln_final(x).type(clip_model.dtype)

        # Same orientation as CLIP's own text encoder: features @ projection.
        # (projection @ features.T would apply the transposed matrix.)
        text_feat = x @ clip_model.text_projection
        return text_feat.view(B, N, -1)

    def _logits(self, images):
        """Return ``(B, N)`` scaled cosine-similarity logits for a batch of images."""
        img_feat = self._image_features(images)

        # One prompt per (image, class) pair: (B, N, L, D).
        prompts = self.prompt_learner(img_feat)

        text_feat = self._encode_prompts(prompts).to(img_feat.dtype)
        # Out-of-place on purpose: an in-place divide would overwrite the
        # tensor that norm() needs for its backward pass.
        text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)

        logit_scale = self.clip_model.logit_scale.exp()
        # Each image is compared only against its own N prompts.
        return logit_scale * (img_feat.unsqueeze(1) * text_feat).sum(-1)

    def _remap_labels(self, labels, cat2idx):
        """Map original class ids to contiguous indices; returns a long tensor on ``self.device``."""
        return torch.tensor(
            [cat2idx[label] for label in labels.tolist()],
            dtype=torch.long,
            device=self.device,
        )

    def _resolve_classnames(self, categories, classnames):
        """Return the class names that belong to ``categories``.

        Explicit ``classnames`` win; the base classes use the names given at
        construction; anything else is looked up in the module-level
        ``CLASS_NAMES`` table by class id.
        """
        if classnames is not None:
            classnames = list(classnames)
            if len(classnames) != len(categories):
                raise ValueError(
                    f"got {len(classnames)} classnames for {len(categories)} categories"
                )
            return classnames
        if categories == list(self.base_classes):
            return list(self.base_classnames)
        return [CLASS_NAMES[c] for c in categories]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def train_epoch(self, train_dataset, batch_size=32):
        """Run one training epoch on the base classes.

        Args:
            train_dataset: Dataset yielding ``(image, label)`` with labels
                drawn from ``base_classes``.
            batch_size: Images per optimisation step.

        Returns:
            Mean cross-entropy loss over the batches of this epoch
            (``0.0`` if the dataset produced no batches).
        """
        self.prompt_learner.train()
        self.clip_model.eval()

        dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=2
        )

        total_loss = 0.0
        n_batches = 0

        for images, labels in tqdm(dataloader, desc="CoCoOp training"):
            images = images.to(self.device)
            targets = self._remap_labels(labels, self.contig_cat2idx)

            logits = self._logits(images)
            loss = F.cross_entropy(logits, targets)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        return total_loss / max(1, n_batches)

    @torch.no_grad()
    def eval(self, dataset, categories, batch_size=64, classnames=None):
        """Compute accuracy on ``dataset`` over the classes in ``categories``.

        The prompts are built from the names of ``categories`` themselves, so
        novel classes are scored against novel-class prompts rather than the
        base-class prompts used for training.

        Args:
            dataset: Dataset yielding ``(image, label)`` with labels drawn
                from ``categories``.
            categories: Class ids to discriminate between.
            batch_size: Images per forward pass.
            classnames: Optional names aligned with ``categories``. When
                omitted, base classes use the names given at construction and
                any other ids are looked up in the module-level
                ``CLASS_NAMES``.

        Returns:
            Fraction of correctly classified samples (``0.0`` for an empty
            dataset).

        Raises:
            ValueError: If ``classnames`` and ``categories`` differ in length.
            KeyError: If a label is not listed in ``categories``.
        """
        self.prompt_learner.eval()
        self.clip_model.eval()

        categories = list(categories)
        contig_cat2idx = {cat: idx for idx, cat in enumerate(categories)}
        names = self._resolve_classnames(categories, classnames)

        # Token ids / embeddings for the classes being evaluated.
        learner = self.prompt_learner
        token_ids = clip.tokenize(names).to(self.device)
        token_embeddings = self.clip_model.token_embedding(token_ids)

        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2
        )

        correct = 0
        total = 0

        # Temporarily point the prompt learner at the evaluated classes so the
        # exact same forward path as in training is used; the training-time
        # buffers are restored afterwards, even if evaluation fails midway.
        saved_ids = learner.class_token_ids
        saved_embeddings = learner.class_token_embeddings
        learner.class_token_ids = token_ids
        learner.class_token_embeddings = token_embeddings
        try:
            for images, labels in tqdm(dataloader, desc="CoCoOp eval"):
                images = images.to(self.device)
                targets = self._remap_labels(labels, contig_cat2idx)

                pred = self._logits(images).argmax(dim=1)

                correct += (pred == targets).sum().item()
                total += targets.size(0)
        finally:
            learner.class_token_ids = saved_ids
            learner.class_token_embeddings = saved_embeddings

        if total == 0:
            return 0.0
        return correct / total

: 

## Training CoCoOp

We will train the PromptLearner for **10 epochs** on **base classes only**.

**Hyperparameters:**
- Learning rate: 0.002 (SGD)
- Momentum: 0.9
- Weight decay: 5e-4
- Batch size: 32
- Epochs: 10

**What happens:**
- Context vectors V1...VM adapt to the Flowers102 dataset
- MetaNetwork learns to generate useful conditional tokens
- CLIP remains frozen (unchanged)

**Expected output:**
- Initial loss: ~3.0
- Final loss: ~1.3-1.5
- Training time: ~5-10 minutes on GPU

In [None]:

# Names of the base classes, in the same order as `base_classes`.
base_classnames = [CLASS_NAMES[class_id] for class_id in base_classes]
print(f"Base classnames ({len(base_classnames)}): {base_classnames[:5]}...\n")

# Wrap the already-loaded CLIP model in a CoCoOp trainer.
trainer = CoCoOpTrainer(
    model,
    base_classnames,
    base_classes,
    novel_classes,
    device,
    lr=0.002,
)

# Train on the base-class training subset and report the mean loss of every epoch.
num_epochs = 10
for epoch in range(num_epochs):
    avg_loss = trainer.train_epoch(train_base, batch_size=32)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")



Base classnames (51): ['pink primrose', 'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea', 'english marigold']...

[CoCoOp] ctx_dim=512, max_len=77


CoCoOp training:   0%|          | 0/16 [00:00<?, ?it/s]

: 

: 

## Final Evaluation and Comparison

We evaluate the trained model on:
1. **Test Base:** Classes seen during training (51 classes)
2. **Test Novel:** Never-seen classes (51 new classes)

Then compare against the zero-shot baseline.

**Interpretation:**
- Base improves significantly (specialization on dataset)
- Novel decreases slightly (learning bias toward base)
- HM remains positive (good compromise)

If HM < 75%, we will add Knowledge Distillation in the next iteration.


In [None]:
# Final report: evaluate the trained prompt learner on both halves of the test
# split and print the result next to the zero-shot baseline computed earlier
# (base_accuracy, novel_accuracy, hm_zeroshot).
print("\n" + "="*60)
print("üìä EVALUATION AND COMPARISON")
print("="*60)

# Evaluate on the base and on the novel classes
base_acc_cocoop = trainer.eval(test_base, base_classes, batch_size=64)
novel_acc_cocoop = trainer.eval(test_novel, novel_classes, batch_size=64)
hm_cocoop = harmonic_mean(base_acc_cocoop, novel_acc_cocoop)

# Print the results
print("\n" + "="*60)
print("üìà RESULTS COMPARISON")
print("="*60)

print("\nüîµ Zero-Shot CLIP (Baseline)")
print(f"   Base Accuracy:  {base_accuracy*100:6.2f}%")
print(f"   Novel Accuracy: {novel_accuracy*100:6.2f}%")
print(f"   Harmonic Mean:  {hm_zeroshot*100:6.2f}%")

# The value in parentheses is the signed difference to the zero-shot
# baseline, in percentage points.
print("\nüü¢ CoCoOp (Prompt Learning + MetaNetwork)")
print(f"   Base Accuracy:  {base_acc_cocoop*100:6.2f}%  (Œî {(base_acc_cocoop-base_accuracy)*100:+6.2f}%)")
print(f"   Novel Accuracy: {novel_acc_cocoop*100:6.2f}%  (Œî {(novel_acc_cocoop-novel_accuracy)*100:+6.2f}%)")
print(f"   Harmonic Mean:  {hm_cocoop*100:6.2f}%  (Œî {(hm_cocoop-hm_zeroshot)*100:+6.2f}%)")

print("\n" + "="*60)
