# CoCoOp Prompt Learning with CLIP on Flowers102

In [1]:
import os
# Prefer expandable segments to reduce fragmentation (restart kernel after changing).
# setdefault() keeps any value already present in the environment.
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')

import sys
import torch
import torchvision

# Ensure CLIP is importable in the current kernel; install it if the import fails.
# The %pip magic is tried first; if that raises (e.g. get_ipython is not defined
# outside IPython), fall back to running pip through sys.executable so the
# package is installed for the same Python interpreter.
try:
    import clip
except Exception:
    import subprocess, importlib
    try:
        get_ipython().run_line_magic('pip', 'install --upgrade git+https://github.com/openai/CLIP.git')
    except Exception:
        # stdout is discarded here; pip errors still surface via check_call's exception.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "git+https://github.com/openai/CLIP.git"], stdout=subprocess.DEVNULL)
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import clip

from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
# NOTE(review): newer PyTorch releases also expose these under torch.amp —
# confirm whether this import path warns on the installed version.
from torch.cuda.amp import autocast, GradScaler
from collections import OrderedDict


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-mmq7qedt
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-mmq7qedt
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=2adad28e5ede7f9c823eaae794b83d61f1d6de886ebf2c3cd912d63dda457da2
  Stored in directory: /tmp/pip-ephem-wheel-cache-xs0hx9_c/wheels/35/3e/df/3d24cbfb3b6a06f17

In [2]:
# Check environment and CLIP installation
import sys, importlib, torch

try:
    import clip
    print("CLIP: già installato")
except Exception:
    print("CLIP non trovato: eseguo installazione nel kernel corrente...")
    import importlib
    # Prefer %pip so the install targets the current Jupyter kernel; fall back to subprocess
    try:
        get_ipython().run_line_magic('pip', 'install --upgrade git+https://github.com/openai/CLIP.git')
    except Exception:
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "git+https://github.com/openai/CLIP.git"])
    # Refresh the import caches so the newly installed package can be found.
    importlib.invalidate_caches()
    import clip
    print("CLIP installato correttamente nel kernel corrente")

# Report which interpreter and torch build the kernel is using.
print("python:", sys.executable)
print("torch:", torch.__version__, "cuda_available:", torch.cuda.is_available())
# On a Mac with Apple Silicon, also check MPS; any failure of the check is ignored (best effort).
try:
    print("mps_available:", torch.backends.mps.is_available())
except Exception:
    pass

print('Riavvia il kernel se necessario, poi esegui questa cella e procedi con l\'allenamento CoCoOp.')


CLIP: già installato
python: /usr/bin/python3
torch: 2.9.0+cu126 cuda_available: True
mps_available: False
Riavvia il kernel se necessario, poi esegui questa cella e procedi con l'allenamento CoCoOp.


## Dataset Functions

We define utility functions for:
- **`get_data()`**: Load Flowers102 from torchvision
- **`base_novel_categories()`**: Split 102 classes into base (0-50) and novel (51-101)
- **`split_data()`**: Filter images for base/novel in each split

This simulates the real scenario: we have 51 seen classes during training (base) and 51 new ones (novel).


In [3]:
def get_data(data_dir="./data", transform=None):
    """Build the Flowers102 datasets and return them as ``(train, val, test)``.

    Args:
        data_dir: root directory handed to torchvision for the dataset files.
        transform: transform passed unchanged to all three splits.

    Returns:
        A 3-tuple with the "train", "val" and "test" splits, in that order.
        Every split is created with ``download=True``.
    """
    def _load(split_name):
        return torchvision.datasets.Flowers102(
            root=data_dir,
            split=split_name,
            download=True,
            transform=transform,
        )

    train, val, test = (_load(name) for name in ("train", "val", "test"))
    return train, val, test


def base_novel_categories(dataset):
    """Split the class ids present in ``dataset`` into base and novel halves.

    Labels are read from the first attribute that exists and is not ``None``
    among ``targets``, ``labels`` and finally the private ``_labels`` (the one
    Flowers102 exposes). Public attributes are preferred so other datasets
    keep working.

    Labels stored as a tensor / array, or as a sequence of 0-d tensors, are
    converted to plain Python scalars first. Tensors hash by identity, so
    without this step ``set()`` would not remove duplicate labels.

    Args:
        dataset: any object exposing its labels through one of the attributes
            listed above.

    Returns:
        ``(base_classes, novel_classes)``: the sorted unique labels cut at
        ``len(unique) // 2``. With an odd number of classes the novel half
        receives the extra class.

    Raises:
        ValueError: if none of the label attributes is available.
    """
    labels = None
    for attr in ("targets", "labels", "_labels"):
        labels = getattr(dataset, attr, None)
        if labels is not None:
            break

    if labels is None:
        raise ValueError("Could not find labels on dataset (checked 'targets','labels','_labels').")

    # Whole-container conversion for tensors / numpy arrays.
    if hasattr(labels, "tolist"):
        labels = labels.tolist()
    # Element-wise conversion for sequences of 0-d tensors / numpy scalars.
    labels = [lbl.item() if hasattr(lbl, "item") else lbl for lbl in labels]

    unique_labels = sorted(set(labels))
    mid = len(unique_labels) // 2
    return unique_labels[:mid], unique_labels[mid:]


def split_data(dataset, base_classes):
    """Partition ``dataset`` into a base-class subset and a novel-class subset.

    Labels are looked up the same way as in ``base_novel_categories``: the
    first available attribute among ``targets``, ``labels`` and the private
    ``_labels`` is used, so datasets that do not expose ``_labels`` are
    supported as well. Tensor labels are converted to Python scalars before
    the membership test.

    Args:
        dataset: dataset exposing its labels through one of the attributes above.
        base_classes: iterable of class ids considered "base"; every other
            label is treated as novel.

    Returns:
        ``(base_dataset, novel_dataset)``: two ``torch.utils.data.Subset``
        views over ``dataset`` whose indices preserve the original order.

    Raises:
        AttributeError: if the dataset exposes none of the label attributes.
    """
    labels = None
    for attr in ("targets", "labels", "_labels"):
        labels = getattr(dataset, attr, None)
        if labels is not None:
            break
    if labels is None:
        raise AttributeError(
            "Could not find labels on dataset (checked 'targets','labels','_labels')."
        )

    if hasattr(labels, "tolist"):
        labels = labels.tolist()

    base_set = set(base_classes)
    base_categories_samples = []
    novel_categories_samples = []

    for sample_id, label in enumerate(labels):
        # 0-d tensors hash by identity; compare their Python value instead.
        if hasattr(label, "item"):
            label = label.item()
        if label in base_set:
            base_categories_samples.append(sample_id)
        else:
            novel_categories_samples.append(sample_id)

    base_dataset = torch.utils.data.Subset(dataset, base_categories_samples)
    novel_dataset = torch.utils.data.Subset(dataset, novel_categories_samples)
    return base_dataset, novel_dataset

## Class Names and Dataset Loading

We load the names of 102 flower classes from Flowers102.

This is **critical** for CLIP:
- Creates prompts like "a photo of a **rose**, a type of flower"
- Each prompt is encoded by CLIP's text encoder
- Image features are compared against these text templates


In [4]:
# Load the splits once without a transform; only the test split's labels are
# used here, to derive the base/novel class ids.
_, _, tmp_test = get_data()
base_classes, novel_classes = base_novel_categories(tmp_test)

# Human-readable flower names, indexed by class id (the commented-out prints
# below look names up as CLASS_NAMES[i] for each class id i).
CLASS_NAMES = ["pink primrose", "hard-leaved pocket orchid", "canterbury bells", "sweet pea", "english marigold", "tiger lily", "moon orchid", "bird of paradise", "monkshood", "globe thistle", "snapdragon", "colt's foot", "king protea", "spear thistle", "yellow iris", "globe-flower", "purple coneflower", "peruvian lily", "balloon flower", "giant white arum lily", "fire lily", "pincushion flower", "fritillary", "red ginger", "grape hyacinth", "corn poppy", "prince of wales feathers", "stemless gentian", "artichoke", "sweet william", "carnation", "garden phlox", "love in the mist", "mexican aster", "alpine sea holly", "ruby-lipped cattleya", "cape flower", "great masterwort", "siam tulip", "lenten rose", "barbeton daisy", "daffodil", "sword lily", "poinsettia", "bolero deep blue", "wallflower", "marigold", "buttercup", "oxeye daisy", "common dandelion", "petunia", "wild pansy", "primula", "sunflower", "pelargonium", "bishop of llandaff", "gaura", "geranium", "orange dahlia", "pink-yellow dahlia?", "cautleya spicata", "japanese anemone", "black-eyed susan", "silverbush", "californian poppy", "osteospermum", "spring crocus", "bearded iris", "windflower", "tree poppy", "gazania", "azalea", "water lily", "rose", "thorn apple", "morning glory", "passion flower", "lotus", "toad lily", "anthurium", "frangipani", "clematis", "hibiscus", "columbine", "desert-rose", "tree mallow", "magnolia", "cyclamen", "watercress", "canna lily", "hippeastrum", "bee balm", "ball moss", "foxglove", "bougainvillea", "camellia", "mallow", "mexican petunia", "bromelia", "blanket flower", "trumpet creeper", "blackberry lily"]

# Uncomment to see class names
# print("Base Class Names:", [(i, CLASS_NAMES[i]) for i in base_classes])
# print("Novel Class Names:", [(i, CLASS_NAMES[i]) for i in novel_classes])

100%|██████████| 345M/345M [00:09<00:00, 34.8MB/s]
100%|██████████| 502/502 [00:00<00:00, 1.70MB/s]
100%|██████████| 15.0k/15.0k [00:00<00:00, 5.74MB/s]


In [5]:
# Load the CLIP model and its matching image preprocessing pipeline.
# The backbone name is defined once so the load call and the log line
# cannot drift apart.
CLIP_BACKBONE = "ViT-B/16"
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load(CLIP_BACKBONE, device=device)
print(f"Device: {device}")
print(f"Model: {CLIP_BACKBONE}")

100%|████████████████████████████████████████| 335M/335M [00:03<00:00, 113MiB/s]


Device: cuda
Model: ViT-B/16


## Load Flowers102 and Split Base/Novel

We load the 3 splits (train, val, test) and divide into base/novel.

**Statistics:**
- Train Base: 10 images × 51 classes = 510 images
- Val Base: 10 images × 51 classes = 510 images
- Test Base: 2,473 images across the 51 base classes (the test split is much larger than train/val)
- Test Novel: 3,676 images across the 51 novel classes

**Note:** Train and val have ~10 images per class (few-shot setting).


In [6]:
# Load the three Flowers102 splits with CLIP's preprocessing applied.
train_set, val_set, test_set = get_data(transform=preprocess)

# Split class ids into base and novel halves.
base_classes, novel_classes = base_novel_categories(train_set)

# Few-shot: sample up to `shots_per_class` images per base class from the train split.
shots_per_class = 16
import random
random.seed(42)  # fixed seed so the selected subset is reproducible

# Collect the train-set indices belonging to each base class.
indices_per_class = {c: [] for c in base_classes}
for idx, label in enumerate(train_set._labels):
    if label in indices_per_class:
        indices_per_class[label].append(idx)

selected = []
shots_taken = []  # number of images actually selected for each base class
for c in base_classes:
    inds = indices_per_class.get(c, [])
    random.shuffle(inds)
    # Take up to shots_per_class; if fewer images are available, take them all.
    chosen = inds[:shots_per_class]
    selected.extend(chosen)
    shots_taken.append(len(chosen))

# Create the few-shot training subset.
train_base = torch.utils.data.Subset(train_set, selected)

# Validation and test splits are only filtered by class, not subsampled.
val_base, _ = split_data(val_set, base_classes)
test_base, test_novel = split_data(test_set, base_classes)

# Report the shots actually selected rather than the requested number: a class
# with fewer than `shots_per_class` images contributes all of them, so the two
# can differ (the recorded run selected 510 samples for 51 classes).
min_shots = min(shots_taken, default=0)
max_shots = max(shots_taken, default=0)
shots_desc = f"{min_shots}" if min_shots == max_shots else f"{min_shots}-{max_shots}"
print(f"Train Base (few-shot): {len(train_base)} samples ({shots_desc} shots per class, {shots_per_class} requested)")
print(f"Val Base: {len(val_base)} samples")
print(f"Test Base: {len(test_base)} samples")
print(f"Test Novel: {len(test_novel)} samples")

Train Base (few-shot): 510 samples (16 shots per class)
Val Base: 510 samples
Test Base: 2473 samples
Test Novel: 3676 samples


## Harmonic Mean (HM)

Standard metric for few-shot adaptation papers.

Formula: HM = 2 / (1/base_acc + 1/novel_acc)

**Why HM instead of arithmetic mean?**
- HM heavily penalizes outliers
- If base=90% and novel=50%: arithmetic mean=70%, HM=64.3%
- Forces the model to balance both accuracies

**Goal:** maximize the HM between the CoCoOp base and novel accuracies (`base_acc` and `novel_acc` in the evaluation cell).


In [7]:
def harmonic_mean(base_accuracy, novel_accuracy):
    """Return the harmonic mean of the base and novel accuracies.

    If either accuracy is zero or negative the result is defined as 0.0,
    which also avoids dividing by zero.
    """
    accuracies = (base_accuracy, novel_accuracy)
    if any(acc <= 0 for acc in accuracies):
        return 0.0
    reciprocal_sum = 1.0 / base_accuracy + 1.0 / novel_accuracy
    return 2.0 / reciprocal_sum


## Text Encoder

In [24]:
class TextEncoder(nn.Module):
    """Runs already-embedded prompts through the text side of a CLIP model.

    The transformer, positional embedding, final layer norm, text projection
    and dtype are all taken from the ``clip_model`` passed to the constructor.
    """

    def __init__(self, clip_model):
        super().__init__()
        # Re-use the CLIP text components by reference (nothing is copied).
        for name in ("transformer", "positional_embedding", "ln_final",
                     "text_projection", "dtype"):
            setattr(self, name, getattr(clip_model, name))

    def forward(self, prompts, tokenized_prompts):
        """Encode prompt embeddings into one feature vector per prompt.

        Args:
            prompts: (N, L, D) prompt embeddings; in this notebook N is the
                number of class prompts built for a single image.
            tokenized_prompts: (N, L) token ids; the argmax along the last
                dimension selects the position whose output is kept (the
                original comment identifies it as the EOT token).

        Returns:
            (N, P) tensor: the selected token of each prompt multiplied by
            the text projection.
        """
        pos = self.positional_embedding.type(self.dtype)
        hidden = (prompts + pos).permute(1, 0, 2)           # NLD -> LND
        hidden = self.transformer(hidden).permute(1, 0, 2)  # LND -> NLD
        hidden = self.ln_final(hidden).type(self.dtype)

        # Pick one position per prompt, then project it.
        eot_positions = tokenized_prompts.argmax(dim=-1)
        rows = torch.arange(int(hidden.shape[0]))
        return hidden[rows, eot_positions] @ self.text_projection

## MetaNetwork: Conditional Token Generator

**Problem:** Fixed prompts don't adapt to each image.

**Solution:** A small neural network that transforms image features into a conditional token.

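**Parameters:** ~263K for the standalone `MetaNetwork` with its defaults (512 → 256 → 512). The meta-net actually built inside `CoCoOpPromptLearner` is 512 → 32 → 512, about 33K parameters. Either way this is negligible vs. fine-tuning.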

**Effect:** Each image gets a different prompt → instance-level adaptation


In [25]:
"""
MetaNetwork è una piccola rete neurale (MLP con 2 layer)
che trasforma le image_features (512-dim) in un token
condizionale (512-dim) usato in CoCoOp.

Questo token varia per ogni immagine, permettendo prompt
personalizzati per ogni input.
"""
\
class MetaNetwork(nn.Module):
    def __init__(self, ctx_dim=512, hidden_dim=256):
        """
        Args:
            ctx_dim: dimensione degli embeddings (512 per ViT-B/16)
            hidden_dim: dimensione dello strato nascosto
        """
        super().__init__()
        self.linear1 = nn.Linear(ctx_dim, hidden_dim)
        self.relu = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(hidden_dim, ctx_dim)

    def forward(self, image_features):
        """
        Args:
            image_features: tensor (B, ctx_dim) dalle immagini encodate

        Returns:
            conditional_token: tensor (B, ctx_dim)
        """
        # Assicura il tipo corretto (importante per mixed precision)
        image_features = image_features.to(self.linear1.weight.dtype)

        out = self.linear1(image_features)
        out = self.relu(out)
        out = self.linear2(out)
        return out


## CoCoOpPromptLearner: Dynamic Prompts


**Components:**
1. **V1...VM:** M learnable context vectors (learned via SGD); this notebook uses M = `n_ctx` = 4
   - Shape: (M, 512) tensor, i.e. (4, 512) here
   - Initialized randomly from N(0, 0.02²)
   - Optimized during training

2. **π(x):** Conditional token (generated per image)
   - Shape: (B, 512) from MetaNetwork output
   - Different for each image

3. **[CLASS]:** Class name embedding
   - Shape: (seq_len, 512) from CLIP's token embedding
   - Same for all images of the same class

**Forward Pass:**
- Input: image_features (B, 512)
- Output: prompts (B, num_classes, seq_len_total, 512)


## CoCoOpTrainer: Training and Evaluation

Class that manages:

**1. Initialization:**
- Create PromptLearner
- Freeze CLIP (`requires_grad=False`)
- Configure SGD optimizer for prompt learner only

**2. train_epoch():**
- Forward: Image encoder + PromptLearner + Text encoder
- **Critical step:** Encode soft prompts through text transformer
  - Add positional embeddings
  - Pass through CLIP's transformer
  - Apply the final layer norm
  - Extract the EOT (end-of-text) token of each prompt and apply the text projection
- Compute loss: Cross-entropy on base classes
- Backward: Backprop only in PromptLearner
- Return: Average loss of the epoch

**3. eval():**
- Same forward procedure as training
- Without backward pass
- Compute accuracy on any dataset (base or novel)

**Important note:** We don't use `model.encode_text()` on soft prompts
because that method expects integer tokens, not embeddings.
We manually forward through the text transformer.

In [26]:
class CoCoOpPromptLearner(nn.Module):
    """Builds image-conditioned soft prompts for every class (CoCoOp).

    A prompt is the concatenation ``[prefix | context | suffix]`` where

    * ``prefix`` is the embedding of the first token of each tokenized class
      prompt (buffer ``token_prefix``, shape (n_cls, 1, D)),
    * ``context`` holds ``n_ctx`` learnable vectors (``self.ctx``) shifted by
      a per-image bias produced by ``self.meta_net``,
    * ``suffix`` is the embedding of everything after the context slots:
      class name, ".", end-of-text and padding (buffer ``token_suffix``).

    Args:
        clip_model: CLIP model providing ``token_embedding``, ``ln_final`` and
            ``visual.output_dim``. It is assumed to live on ``device``.
        classnames: class names; "_" is replaced by a space.
        n_ctx: number of context vectors when ``ctx_init`` is not given.
        ctx_init: optional text used to initialise the context. When given,
            ``n_ctx`` is overridden by its number of space-separated words.
            NOTE(review): this assumes one token per word — confirm for
            multi-token words.
        device: device the tokenized prompts are moved to before embedding.
    """

    def __init__(self, clip_model, classnames, n_ctx=4, ctx_init=None, device='cuda'):
        super().__init__()

        n_cls = len(classnames)
        ctx_dim = clip_model.ln_final.weight.shape[0]
        vis_dim = clip_model.visual.output_dim

        # Context vectors
        if ctx_init:
            ctx_init = ctx_init.replace("_", " ")
            n_ctx = len(ctx_init.split(" "))
            # Tokens must be on the same device as the embedding table,
            # exactly as for the class prompts below.
            prompt = clip.tokenize(ctx_init).to(device)
            with torch.no_grad():
                embedding = clip_model.token_embedding(prompt).type(torch.float32)
            # Skip position 0 (the prefix slot) and keep the n_ctx word embeddings.
            ctx_vectors = embedding[0, 1:1+n_ctx, :]
            prompt_prefix = ctx_init
        else:
            # Random initialisation from N(0, 0.02^2); "X" placeholders reserve
            # one token slot per context vector in the tokenized prompt.
            ctx_vectors = torch.empty(n_ctx, ctx_dim, dtype=torch.float32)
            nn.init.normal_(ctx_vectors, std=0.02)
            prompt_prefix = " ".join(["X"] * n_ctx)

        print(f'Initial context: "{prompt_prefix}"')
        print(f"Number of context words: {n_ctx}")

        self.ctx = nn.Parameter(ctx_vectors)

        # Meta network: vis_dim -> vis_dim // 16 -> ctx_dim
        self.meta_net = nn.Sequential(OrderedDict([
            ("linear1", nn.Linear(vis_dim, vis_dim // 16)),
            ("relu", nn.ReLU(inplace=True)),
            ("linear2", nn.Linear(vis_dim // 16, ctx_dim))
        ]))

        # Class embeddings
        classnames = [name.replace("_", " ") for name in classnames]
        prompts = [prompt_prefix + " " + name + "." for name in classnames]

        tokenized_prompts = torch.cat([clip.tokenize(p) for p in prompts])
        tokenized_prompts = tokenized_prompts.to(device)

        with torch.no_grad():
            embedding = clip_model.token_embedding(tokenized_prompts).type(torch.float32)

        # Frozen parts of every prompt; the n_ctx slots in between are
        # replaced by the learnable context in forward().
        self.register_buffer("token_prefix", embedding[:, :1, :])
        self.register_buffer("token_suffix", embedding[:, 1+n_ctx:, :])

        self.n_cls = n_cls
        self.n_ctx = n_ctx
        # Plain attribute, not a buffer: it was already placed on `device`
        # above and is read by the text encoder.
        self.tokenized_prompts = tokenized_prompts

    def construct_prompts(self, ctx, prefix, suffix, label=None):
        """Concatenate ``[prefix, ctx, suffix]`` along the token dimension.

        Args:
            ctx: (n_cls, n_ctx, D) context vectors.
            prefix: (n_cls, 1, D) prefix embeddings.
            suffix: (n_cls, L_suffix, D) suffix embeddings.
            label: optional index; when given, ``prefix`` and ``suffix`` are
                indexed with it before concatenation.

        Returns:
            Tensor with the three parts joined on dim 1.
        """
        if label is not None:
            prefix = prefix[label]
            suffix = suffix[label]

        prompts = torch.cat([prefix, ctx, suffix], dim=1)
        return prompts

    def forward(self, im_features):
        """Build one prompt per (image, class) pair.

        Args:
            im_features: (B, vis_dim) image features.

        Returns:
            (B, n_cls, L, D) prompt embeddings, where L is
            ``1 + n_ctx + L_suffix``. An empty batch yields an empty tensor.
        """
        prefix = self.token_prefix  # (n_cls, 1, D)
        suffix = self.token_suffix  # (n_cls, L_suffix, D)

        # Match the meta-net's parameter dtype so half-precision image
        # features do not cause a dtype mismatch outside autocast.
        im_features = im_features.to(self.meta_net.linear1.weight.dtype)

        bias = self.meta_net(im_features).unsqueeze(1)  # (B, 1, D)
        ctx_shifted = self.ctx.unsqueeze(0) + bias      # (B, n_ctx, D)

        # Broadcast over classes and images instead of looping per image;
        # expand() creates views, the only copy is the final cat.
        batch = ctx_shifted.shape[0]
        ctx_all = ctx_shifted.unsqueeze(1).expand(-1, self.n_cls, -1, -1)
        prefix_all = prefix.unsqueeze(0).expand(batch, -1, -1, -1)
        suffix_all = suffix.unsqueeze(0).expand(batch, -1, -1, -1)

        prompts = torch.cat([prefix_all, ctx_all, suffix_all], dim=2)
        return prompts

In [27]:
class CustomCLIP(nn.Module):
    """CLIP model whose text prompts are produced by a CoCoOp prompt learner.

    Args:
        clip_model: CLIP model supplying the image encoder (``visual``), the
            text components used by ``TextEncoder``, ``logit_scale`` and ``dtype``.
        classnames: class names the prompts are built for.
        n_ctx: number of learnable context vectors.
        ctx_init: optional text used to initialise the context vectors; it is
            forwarded to the prompt learner.
        device: device on which the prompt learner places its tokenized prompts.
    """

    def __init__(self, clip_model, classnames, n_ctx=4, ctx_init=None, device='cuda'):
        super().__init__()

        # ctx_init must be forwarded too, otherwise a caller-supplied
        # initial context would be silently ignored.
        self.prompt_learner = CoCoOpPromptLearner(clip_model, classnames,
                                   n_ctx=n_ctx,
                                   ctx_init=ctx_init,
                                   device=device)

        self.tokenized_prompts = self.prompt_learner.tokenized_prompts
        self.image_encoder = clip_model.visual
        self.text_encoder = TextEncoder(clip_model)
        self.logit_scale = clip_model.logit_scale
        self.dtype = clip_model.dtype

    def forward(self, image, label=None):
        """
        Args:
            image: batch of images; it is cast to ``self.dtype`` and passed to
                the image encoder (the original docstring gives the shape as
                (batch_size, 3, 224, 224)).
            label: optional (batch_size,) class indices.

        Returns:
            The cross-entropy loss when ``label`` is given, otherwise the
            (batch_size, n_cls) logits.
        """
        logit_scale = self.logit_scale.exp()

        # Encode images and L2-normalise the features.
        image_features = self.image_encoder(image.type(self.dtype))
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # Instance-conditioned prompts: (batch, n_cls, n_tokens, dim)
        prompts = self.prompt_learner(image_features)

        # Each image has its own set of class prompts, so the text encoder
        # runs once per image.
        logits = []
        for pts_i, imf_i in zip(prompts, image_features):
            # pts_i: (n_cls, n_tokens, dim) -> text_features: (n_cls, dim)
            text_features = self.text_encoder(pts_i, self.tokenized_prompts)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            # Scaled cosine similarity between this image and every class prompt.
            l_i = logit_scale * (imf_i @ text_features.t())  # (n_cls,)
            logits.append(l_i)

        logits = torch.stack(logits)  # (batch_size, n_cls)

        # Return the loss when labels are provided, the logits otherwise.
        if label is not None:
            return F.cross_entropy(logits, label)

        return logits


## Training CoCoOp

We will train the PromptLearner for **10 epochs** on **base classes only**.

**Hyperparameters:**
- Learning rate: 0.002 (SGD)
- Momentum: 0.9
- Weight decay: 5e-4
- Batch size: 4
- Epochs: 10

**What happens:**
- Context vectors V1...VM adapt to the Flowers102 dataset
- MetaNetwork learns to generate useful conditional tokens
- CLIP remains frozen (unchanged)

**Expected output:**
- Initial loss: ~1.3 (average over the first epoch)
- Final loss: ~0.3
- Training time: ~2.5 minutes per epoch (~25 minutes total) on GPU

In [28]:
def harmonic_mean(base_acc, novel_acc):
    """Harmonic mean of two accuracies; 0.0 if either one is not positive.

    NOTE: a function with the same name and behavior is defined earlier in
    this notebook; this definition shadows it.
    """
    if any(acc <= 0 for acc in (base_acc, novel_acc)):
        return 0.0
    inverse_sum = 1.0 / base_acc + 1.0 / novel_acc
    return 2.0 / inverse_sum


# Setup: map class ids to their human-readable names.
base_classnames = [CLASS_NAMES[i] for i in base_classes]
novel_classnames = [CLASS_NAMES[i] for i in novel_classes]

# Training hyperparameters, defined once. `num_epochs` is passed to the trainer
# and also bounds the loop below, so the two can no longer drift apart (the
# trainer's log reports a cosine schedule with T_max equal to this value).
num_epochs = 10
train_batch_size = 4

# Initialize trainer (prompts are built from the base class names only).
trainer = CoCoOpTrainer(
    clip_model=model,
    classnames=base_classnames,
    base_classes=base_classes,
    novel_classes=novel_classes,
    device=device,
    lr=0.002,
    n_ctx=4,
    num_epochs=num_epochs
)

# Train
print("\n" + "="*70)
print("TRAINING CoCoOp")
print("="*70)

for epoch in range(num_epochs):
    avg_loss = trainer.train_epoch(train_base, batch_size=train_batch_size)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")

print("\n" + "="*70)
print("TRAINING COMPLETED")
print("="*70)

Initial context: "X X X X"
Number of context words: 4

CoCoOpTrainer initialized:
  LR: 0.002
  Momentum: 0.9
  Weight decay: 5e-4
  Scheduler: CosineAnnealing (T_max=10)
  Trainable params: 35,360


TRAINING CoCoOp


Training:   1%|          | 1/128 [00:01<03:28,  1.64s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 11.74986744
  meta_net.linear1.weight                  | grad_norm: 10.27029991
  meta_net.linear1.bias                    | grad_norm: 10.23728561
  meta_net.linear2.weight                  | grad_norm: 3.38785386
  meta_net.linear2.bias                    | grad_norm: 22.18164253



Training: 100%|██████████| 128/128 [02:24<00:00,  1.13s/it]


Epoch 1/10 - Loss: 1.2628


Training:   1%|          | 1/128 [00:01<02:22,  1.12s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 0.25105047
  meta_net.linear1.weight                  | grad_norm: 0.16530927
  meta_net.linear1.bias                    | grad_norm: 0.17081828
  meta_net.linear2.weight                  | grad_norm: 0.24921629
  meta_net.linear2.bias                    | grad_norm: 0.45846319



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]


Epoch 2/10 - Loss: 0.8666


Training:   1%|          | 1/128 [00:01<02:20,  1.10s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 1.52513909
  meta_net.linear1.weight                  | grad_norm: 0.72689837
  meta_net.linear1.bias                    | grad_norm: 0.77104610
  meta_net.linear2.weight                  | grad_norm: 1.81277370
  meta_net.linear2.bias                    | grad_norm: 2.82308865



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]


Epoch 3/10 - Loss: 0.6887


Training:   1%|          | 1/128 [00:01<02:20,  1.10s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 2.03115726
  meta_net.linear1.weight                  | grad_norm: 1.26247370
  meta_net.linear1.bias                    | grad_norm: 1.28350341
  meta_net.linear2.weight                  | grad_norm: 2.37043357
  meta_net.linear2.bias                    | grad_norm: 3.72208595



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]


Epoch 4/10 - Loss: 0.5825


Training:   1%|          | 1/128 [00:01<02:21,  1.11s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 1.52476120
  meta_net.linear1.weight                  | grad_norm: 0.62208641
  meta_net.linear1.bias                    | grad_norm: 0.61128104
  meta_net.linear2.weight                  | grad_norm: 1.89834023
  meta_net.linear2.bias                    | grad_norm: 2.76511836



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]


Epoch 5/10 - Loss: 0.4636


Training:   1%|          | 1/128 [00:01<02:20,  1.10s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 0.22821987
  meta_net.linear1.weight                  | grad_norm: 0.09319240
  meta_net.linear1.bias                    | grad_norm: 0.09196008
  meta_net.linear2.weight                  | grad_norm: 0.28984636
  meta_net.linear2.bias                    | grad_norm: 0.41590133



Training: 100%|██████████| 128/128 [02:24<00:00,  1.13s/it]


Epoch 6/10 - Loss: 0.4037


Training:   1%|          | 1/128 [00:01<02:19,  1.10s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 1.86963296
  meta_net.linear1.weight                  | grad_norm: 1.18808711
  meta_net.linear1.bias                    | grad_norm: 1.21815026
  meta_net.linear2.weight                  | grad_norm: 2.30183983
  meta_net.linear2.bias                    | grad_norm: 3.33961344



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]


Epoch 7/10 - Loss: 0.3710


Training:   1%|          | 1/128 [00:01<02:20,  1.10s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 1.73204231
  meta_net.linear1.weight                  | grad_norm: 0.48262966
  meta_net.linear1.bias                    | grad_norm: 0.47898269
  meta_net.linear2.weight                  | grad_norm: 2.20052743
  meta_net.linear2.bias                    | grad_norm: 3.10994554



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]


Epoch 8/10 - Loss: 0.3319


Training:   1%|          | 1/128 [00:01<02:20,  1.10s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 1.83726847
  meta_net.linear1.weight                  | grad_norm: 0.86456114
  meta_net.linear1.bias                    | grad_norm: 0.89526719
  meta_net.linear2.weight                  | grad_norm: 2.28078914
  meta_net.linear2.bias                    | grad_norm: 3.24794793



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]


Epoch 9/10 - Loss: 0.3141


Training:   1%|          | 1/128 [00:01<02:20,  1.10s/it]


GRADIENT CHECK (first batch):
  ctx                                      | grad_norm: 1.65531600
  meta_net.linear1.weight                  | grad_norm: 0.72344887
  meta_net.linear1.bias                    | grad_norm: 0.72058392
  meta_net.linear2.weight                  | grad_norm: 2.13835073
  meta_net.linear2.bias                    | grad_norm: 3.06891727



Training: 100%|██████████| 128/128 [02:23<00:00,  1.12s/it]

Epoch 10/10 - Loss: 0.2992

TRAINING COMPLETED





## Final Evaluation (CoCoOp only)

We'll evaluate the model with:
1. Test Base
2. Test Novel

Computing Harmonic Mean between them to evaluate the trade-off.


In [29]:
print("\n" + "="*70)
print("EVALUATION")
print("="*70)

# Evaluate on the base and novel test subsets; eval() takes the class-id list
# (not class names) as its second argument.
# NOTE(review): the trainer was constructed with the base class names only, and
# the recorded run reports 87.59% base vs 4.33% novel accuracy. That gap is
# consistent with novel images being scored against base-class prompts —
# confirm that trainer.eval() rebuilds the prompts for the novel class names
# and remaps the labels accordingly.
base_acc = trainer.eval(test_base, base_classes, batch_size=64)
novel_acc = trainer.eval(test_novel, novel_classes, batch_size=64)
hm = harmonic_mean(base_acc, novel_acc)

# Accuracies are multiplied by 100 for display, i.e. they are treated as fractions here.
print("\n" + "="*70)
print("RESULTS")
print("="*70)
print(f"  Base Accuracy:  {base_acc*100:6.2f}%")
print(f"  Novel Accuracy: {novel_acc*100:6.2f}%")
print(f"  Harmonic Mean:  {hm*100:6.2f}%")
print("="*70)


EVALUATION


Evaluating: 100%|██████████| 39/39 [05:38<00:00,  8.68s/it]
Evaluating: 100%|██████████| 58/58 [08:22<00:00,  8.66s/it]


RESULTS
  Base Accuracy:   87.59%
  Novel Accuracy:   4.33%
  Harmonic Mean:    8.24%



