In [None]:
# Fetch the project repository. It presumably supplies the local modules imported
# later (Normalization, mesh, render, utils) and the data/ folder — confirm in the repo.
!git clone https://github.com/ezmi234/Affordance_Highlighting_Project_2024.git

Cloning into 'Affordance_Highlighting_Project_2024'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 27 (delta 0), reused 27 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (27/27), 1.81 MiB | 4.05 MiB/s, done.


In [None]:
# Work from the cloned repository root so that relative paths such as 'data/dog.obj'
# resolve against it.
%cd Affordance_Highlighting_Project_2024

/content/Affordance_Highlighting_Project_2024


In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
!pip install optuna

# Downgrade numpy to a compatible version
!pip install numpy==1.23.5 --force-reinstall

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-u_zfy2n9
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-u_zfy2n9
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [None]:
import torch

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Record which runtime stack this notebook is executing on.
print(f"PyTorch version: {torch.__version__}, CUDA version: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}")

PyTorch version: 2.6.0+cu124, CUDA version: 12.4, GPU available: True


In [None]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch.nn as nn
import torchvision
from datetime import datetime
from google.colab import drive
drive.mount('/content/drive')

from itertools import permutations, product
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import color_mesh
import optuna
import time

Warp 1.7.0 initialized:
   CUDA Toolkit 12.8, Driver 12.4
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.7.0
Mounted at /content/drive


In [None]:
# Ask the clip package which checkpoints it can load and list them one per line.
clip_models = clip.available_models()
print("Available CLIP models:", *clip_models, sep="\n")

Available CLIP models:
RN50
RN101
RN50x4
RN50x16
RN50x64
ViT-B/32
ViT-B/16
ViT-L/14
ViT-L/14@336px


In [None]:
class NeuralHighlighter(nn.Module):
    """MLP that turns per-point features into a probability distribution over classes.

    The network is a stack of ``Linear -> ReLU -> LayerNorm`` blocks followed by
    a final ``Linear`` and a softmax over the last dimension, so the input may
    be a single point ``(input_dim,)``, a batch ``(N, input_dim)`` or carry any
    number of leading dimensions.
    """

    def __init__(self, input_dim=3, hidden_dim=256, output_dim=2, num_layers=6):
        """
        Args:
            input_dim: size of each input vector, usually 3 (x, y, z)
            hidden_dim: size of hidden layers
            output_dim: number of classes, 2 for [highlight, gray]
            num_layers: total number of linear layers; values below 2 still
                yield two linear layers (input block + output layer)
        """
        super().__init__()

        def hidden_block(in_features):
            # One repeated unit of the trunk; modules are created in the same
            # order as before so parameter names and seeded init are unchanged.
            return [nn.Linear(in_features, hidden_dim), nn.ReLU(), nn.LayerNorm(hidden_dim)]

        layers = hidden_block(input_dim)
        for _ in range(num_layers - 2):
            layers.extend(hidden_block(hidden_dim))

        layers.append(nn.Linear(hidden_dim, output_dim))
        # Normalise over the class axis (always the last one). The previous
        # dim=1 was only correct for exactly 2-D input.
        layers.append(nn.Softmax(dim=-1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return class probabilities with the same leading shape as ``x``."""
        return self.model(x)


def get_clip_model(clipmodel='ViT-L/14', jit=False):
    """Load a CLIP checkpoint on the GPU when CUDA is available, else on the CPU.

    Args:
        clipmodel: checkpoint name handed to ``clip.load``.
        jit: forwarded to ``clip.load`` unchanged.

    Returns:
        The ``(model, preprocess)`` pair produced by ``clip.load``.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    loaded = clip.load(clipmodel, device=device, jit=jit)
    model, preprocess = loaded

    print(f"Loaded CLIP model: {clipmodel} on {device} (jit={jit})")
    return model, preprocess


# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the hard-labelled mesh and a multi-view render of it.

    The MLP is put in eval mode and every vertex is assigned to its most
    probable class. The mesh is recoloured with the resulting one-hot labels,
    rendered from five views, exported as ``{name}.ply`` and the render is
    stored as ``final_render.jpg``, both inside ``log_dir``.

    Args:
        log_dir: directory receiving the ``.ply`` file and the final render.
        name: base file name (without extension) of the exported mesh.
        mesh: mesh object; it is recoloured through ``color_mesh`` and written
            with its ``export`` method.
        mlp: module mapping ``vertices`` to per-vertex class probabilities;
            class 0 is treated as "highlight".
        vertices: tensor fed to ``mlp``.
        colors: accepted for interface compatibility but ignored; the fixed
            highlight/gray palette defined below is always used.
        render: object exposing ``render_views``.
        background: forwarded to ``render.render_views``.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        # Build every helper tensor on the device the prediction lives on
        # instead of relying on a notebook-level ``device`` global.
        target_device = probs.device
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # Hard (one-hot) labels used to colour the mesh for rendering.
        one_hot = torch.zeros(probs.shape, device=target_device).scatter_(1, max_idx, 1)

        highlight = torch.tensor([204, 255, 0], device=target_device)
        gray = torch.tensor([180, 180, 180], device=target_device)
        palette = torch.stack((highlight / 255, gray / 255))
        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # Per-vertex 0-255 colour for the exported file: the (N, 1) class mask
        # broadcasts against the (3,) colour vectors.
        final_color = torch.where(max_idx == 0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')

def clip_loss(rendered_images, text_prompt, clip_transform, clip_model, tokenizer, device, aug_transform=None, n_augs=0):
    """Negative mean CLIP cosine similarity between rendered views and a prompt.

    The prompt is tokenised and encoded without gradient tracking. The images
    are transformed, encoded with ``clip_model.encode_image`` and compared to
    the text embedding; gradients flow through the image path only.

    Args:
        rendered_images: batch of rendered views passed to the transform.
        text_prompt: prompt string the renders should match.
        clip_transform: transform applied to the images when ``n_augs == 0``.
        clip_model: object exposing ``encode_text`` and ``encode_image``.
        tokenizer: callable turning a list of strings into a token tensor.
        device: device the token tensor is moved to.
        aug_transform: random transform applied once per augmentation round;
            required when ``n_augs > 0``.
        n_augs: number of augmentation rounds. ``0`` scores the un-augmented
            images once; otherwise the loss is averaged over ``n_augs``
            independently augmented copies.

    Returns:
        Scalar tensor: minus the mean cosine similarity (lower is better).

    Raises:
        ValueError: if ``n_augs`` is negative, or positive without an
            ``aug_transform``.
    """
    if n_augs < 0:
        raise ValueError(f"n_augs must be non-negative, got {n_augs}")
    if n_augs > 0 and aug_transform is None:
        raise ValueError("aug_transform is required when n_augs > 0")

    # Encode text
    text_tokens = tokenizer([text_prompt]).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_tokens).float()
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)  # L2 norm

    def _mean_similarity(image_batch):
        # Cast to float32 so both branches compare against the float32 text
        # embedding in the same precision; cosine_similarity normalises the
        # image features itself.
        image_features = clip_model.encode_image(image_batch).float()
        return torch.mean(torch.cosine_similarity(image_features, text_features))

    if n_augs == 0:
        return -_mean_similarity(clip_transform(rendered_images))

    # Each round draws a fresh random augmentation of the same renders.
    total = sum(_mean_similarity(aug_transform(rendered_images)) for _ in range(n_augs))
    return -total / n_augs


def save_renders(dir, i, rendered_images, name=None):
    """Write a batch of rendered images to disk as a single image file.

    Args:
        dir: output directory.
        i: iteration index, used in the default file name.
        rendered_images: images handed to ``torchvision.utils.save_image``.
        name: explicit file name inside ``dir``; when ``None`` the file is
            written to ``renders/iter_{i}.jpg`` inside ``dir``.
    """
    if name is not None:
        target = os.path.join(dir, name)
    else:
        target = os.path.join(dir, 'renders/iter_{}.jpg'.format(i))

    # save_image does not create missing folders, so make sure the parent
    # (notably the ``renders`` sub-directory) exists first.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    torchvision.utils.save_image(rendered_images, target)


In [None]:
# Pin every random number generator the notebook touches to one seed.
# (Some torch backward functions within CLIP are non-deterministic, so runs
# can still differ slightly.)

# ==== Set Seed for Determinism ====
seed = 42

# Python and NumPy generators
random.seed(seed)
np.random.seed(seed)

# PyTorch generators: CPU, current GPU, all GPUs
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Request deterministic cuDNN kernels and switch off its benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# ==== Settings ====
render_res = 224  # side length used for the renderer and for the resize/crop transforms
obj_path = 'data/dog.obj'  # mesh file, relative to the current working directory
prompt = 'A gray dog with highlighted hat'  # text prompt the renders are scored against in clip_loss

In [None]:
# ==== Load Mesh ====
# File stem and extension of the mesh path; neither is referenced again in the visible cells.
objbase, extension = os.path.splitext(os.path.basename(obj_path))
# Square renderer of render_res x render_res.
render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
# NOTE(review): presumably rescales/recenters the mesh in place — confirm in Normalization.py.
MeshNormalizer(mesh)()

# ==== Normalization and Augmentation ====
# Per-channel (mean, std) handed to torchvision's Normalize before images reach CLIP.
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# Deterministic path, used by clip_loss when n_augs == 0: resize, then normalize.
clip_transform = transforms.Compose([
    transforms.Resize((render_res, render_res)),
    clip_normalizer
])

# Randomized path, used by clip_loss when n_augs > 0: a resized crop whose area
# fraction is pinned to 1 via scale=(1, 1), then a perspective warp applied with
# probability 0.8. fill=1 presumably matches the white background defined below.
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(render_res, scale=(1, 1)),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    clip_normalizer
])

# ==== Colors and Other Constants ====
# Row 0 is the highlight color (204, 255, 0)/255, row 1 is gray (180, 180, 180)/255.
colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)
# White render background.
background = torch.tensor((1., 1., 1.)).to(device)
# Independent copy of the mesh vertices on the compute device; this is the MLP input.
vertices = copy.deepcopy(mesh.vertices).to(device)
# Notebook-level default; objective() draws its own n_views per trial, which shadows this one.
n_views = 5

In [None]:
import gc

In [None]:
def objective(trial):
    """Optuna objective: train one NeuralHighlighter and return its final CLIP loss.

    Samples the CLIP backbone, learning rate, MLP depth, number of
    augmentations, number of rendered views and iteration count from ``trial``.
    A fresh MLP is then optimised so that renders of the coloured mesh match
    ``prompt`` under ``clip_loss``. Periodic renders, a text log, the final
    mesh and a summary file are written to a per-trial folder on Google Drive.

    Relies on notebook globals: ``device``, ``vertices``, ``mesh``, ``colors``,
    ``render``, ``background``, ``prompt``, ``clip_transform`` and
    ``augment_transform``.

    Args:
        trial: the Optuna trial used to draw hyperparameters.

    Returns:
        Mean of the last ten recorded loss values; lower is better.
    """
    # Sample hyperparameters
    clip_model_name = trial.suggest_categorical("clip_model", ["ViT-B/32", "ViT-B/16", "ViT-L/14"])
    safe_model_name = clip_model_name.replace("/", "-")  # "/" is not usable in a folder name
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    network_depth = trial.suggest_int("depth", 3, 8)
    n_augs = trial.suggest_int("n_augs", 0, 5)
    n_views = trial.suggest_int("n_views", 2, 8)
    n_iters = trial.suggest_int("n_iters", 1800, 2500, step=100)

    # Timestamped export path to Google Drive
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_name = f"trial_{trial.number}_{safe_model_name}_lr{learning_rate:.1e}_d{network_depth}_v{n_views}_a{n_augs}_i{n_iters}"
    export_path = f"/content/drive/MyDrive/affordance_outputs/optuna_{timestamp}_{run_name}"
    os.makedirs(export_path+"/renders", exist_ok=True)

    # === Load components ===
    model, _ = get_clip_model(clip_model_name)
    # CLIP is only a fixed scorer: freeze it so backward() does not allocate and
    # accumulate gradients for the whole backbone. Gradients still reach the
    # rendered images, and through them the MLP.
    model.requires_grad_(False)
    tokenizer = clip.tokenize

    # Define MLP with trial's depth
    mlp = NeuralHighlighter(num_layers=network_depth).to(device)
    optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate)

    losses = []
    start_time = time.time()
    for i in range(n_iters):
        optimizer.zero_grad()
        pred_class = mlp(vertices)

        # Colour the shared mesh with the soft predictions and render it.
        color_mesh(pred_class, mesh, colors)
        rendered_images, _, _ = render.render_views(
            mesh,
            num_views=n_views,
            show=False,
            center_azim=0,
            center_elev=0,
            std=1,
            return_views=True,
            lighting=True,
            background=background
        )

        loss = clip_loss(rendered_images, prompt, clip_transform, model, tokenizer, device, augment_transform, n_augs=n_augs)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        losses.append(loss_value)

        # Save intermediate results every 100 iterations
        if i % 100 == 0:
            save_renders(export_path, i, rendered_images)
            with open(os.path.join(export_path, "log.txt"), "a") as f:
                f.write(f"Iter {i} | Loss: {loss_value:.4f} | Last 100 avg CLIP score: {np.mean(losses[-100:]):.4f}\n")

    total_time = time.time() - start_time
    minutes, seconds = divmod(total_time, 60)

    # Release CLIP before the final export: drop the reference, collect it, and
    # only then hand the freed cached blocks back to the driver.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # Average of the last ten iterations is the trial's score.
    tail = losses[-10:]
    final_score = sum(tail) / len(tail)

    # Save final output
    save_final_results(export_path, run_name, mesh, mlp, vertices, colors, render, background)
    with open(os.path.join(export_path, "final_score.txt"), "w") as f:
        f.write(f"Prompt: {prompt}\n")
        f.write(f"Final average CLIP loss: {final_score:.4f}\n")
        f.write(f"Total time: {int(minutes)}m {int(seconds)}s\n")
        f.write(str(trial.params))

    return final_score

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable;
# without it the search differs on every run even though the RNGs above are seeded.
# TPESampler is the sampler create_study uses by default, so only the seed is new.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=seed))
study.optimize(objective, n_trials=20)  # you can increase later

print("Best hyperparameters:", study.best_params)

[I 2025-04-28 07:57:17,168] A new study created in memory with name: no-name-a245b9e1-4f83-4a7c-bdc3-8f29fc4c783c
100%|███████████████████████████████████████| 890M/890M [00:17<00:00, 51.9MiB/s]


Loaded CLIP model: ViT-L/14 on cuda (jit=False)


[I 2025-04-28 08:18:32,979] Trial 0 finished with value: -0.21702663451433182 and parameters: {'clip_model': 'ViT-L/14', 'lr': 1.2685990672316477e-05, 'depth': 8, 'n_augs': 5, 'n_views': 3, 'n_iters': 1900}. Best is trial 0 with value: -0.21702663451433182.
100%|███████████████████████████████████████| 335M/335M [00:07<00:00, 45.4MiB/s]


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 08:26:48,361] Trial 1 finished with value: -0.2792400598526001 and parameters: {'clip_model': 'ViT-B/16', 'lr': 1.8833003102917162e-05, 'depth': 8, 'n_augs': 5, 'n_views': 2, 'n_iters': 2500}. Best is trial 1 with value: -0.2792400598526001.


Loaded CLIP model: ViT-B/32 on cuda (jit=False)


[I 2025-04-28 08:30:04,119] Trial 2 finished with value: -0.2937143176794052 and parameters: {'clip_model': 'ViT-B/32', 'lr': 4.223542257667139e-05, 'depth': 8, 'n_augs': 0, 'n_views': 7, 'n_iters': 2500}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-L/14 on cuda (jit=False)


[I 2025-04-28 09:04:04,510] Trial 3 finished with value: -0.21607328802347184 and parameters: {'clip_model': 'ViT-L/14', 'lr': 0.00018151229222609444, 'depth': 4, 'n_augs': 5, 'n_views': 4, 'n_iters': 2500}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-B/32 on cuda (jit=False)


[I 2025-04-28 09:09:37,971] Trial 4 finished with value: -0.2827012449502945 and parameters: {'clip_model': 'ViT-B/32', 'lr': 0.0009145851896952632, 'depth': 8, 'n_augs': 3, 'n_views': 8, 'n_iters': 2200}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-B/32 on cuda (jit=False)


[I 2025-04-28 09:18:38,134] Trial 5 finished with value: -0.2931859791278839 and parameters: {'clip_model': 'ViT-B/32', 'lr': 2.5416600779074687e-05, 'depth': 4, 'n_augs': 5, 'n_views': 8, 'n_iters': 2500}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-L/14 on cuda (jit=False)


[I 2025-04-28 09:26:03,044] Trial 6 finished with value: -0.226605024933815 and parameters: {'clip_model': 'ViT-L/14', 'lr': 0.00013496661091302452, 'depth': 8, 'n_augs': 0, 'n_views': 5, 'n_iters': 1900}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-L/14 on cuda (jit=False)


[I 2025-04-28 09:56:25,237] Trial 7 finished with value: -0.21664746403694152 and parameters: {'clip_model': 'ViT-L/14', 'lr': 2.2734562940302435e-05, 'depth': 6, 'n_augs': 3, 'n_views': 8, 'n_iters': 1900}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-B/32 on cuda (jit=False)


[I 2025-04-28 10:03:17,468] Trial 8 finished with value: -0.2668859541416168 and parameters: {'clip_model': 'ViT-B/32', 'lr': 0.00038541686325962884, 'depth': 6, 'n_augs': 5, 'n_views': 2, 'n_iters': 2200}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-B/32 on cuda (jit=False)


[I 2025-04-28 10:09:38,842] Trial 9 finished with value: -0.28204200267791746 and parameters: {'clip_model': 'ViT-B/32', 'lr': 6.988736813494393e-05, 'depth': 6, 'n_augs': 4, 'n_views': 6, 'n_iters': 2200}. Best is trial 2 with value: -0.2937143176794052.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 10:13:15,082] Trial 10 finished with value: -0.30994645655155184 and parameters: {'clip_model': 'ViT-B/16', 'lr': 7.959625858448283e-05, 'depth': 3, 'n_augs': 0, 'n_views': 6, 'n_iters': 2300}. Best is trial 10 with value: -0.30994645655155184.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 10:17:04,187] Trial 11 finished with value: -0.303798121213913 and parameters: {'clip_model': 'ViT-B/16', 'lr': 5.5884882896024515e-05, 'depth': 5, 'n_augs': 0, 'n_views': 6, 'n_iters': 2400}. Best is trial 10 with value: -0.30994645655155184.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 10:20:44,507] Trial 12 finished with value: -0.28819615244865415 and parameters: {'clip_model': 'ViT-B/16', 'lr': 5.836226564750243e-05, 'depth': 3, 'n_augs': 1, 'n_views': 6, 'n_iters': 2300}. Best is trial 10 with value: -0.30994645655155184.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 10:24:24,076] Trial 13 finished with value: -0.29821066856384276 and parameters: {'clip_model': 'ViT-B/16', 'lr': 0.0001882546318954413, 'depth': 3, 'n_augs': 1, 'n_views': 6, 'n_iters': 2300}. Best is trial 10 with value: -0.30994645655155184.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 10:27:36,064] Trial 14 finished with value: -0.29074622690677643 and parameters: {'clip_model': 'ViT-B/16', 'lr': 0.00010110539049890098, 'depth': 5, 'n_augs': 1, 'n_views': 5, 'n_iters': 2300}. Best is trial 10 with value: -0.30994645655155184.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 10:32:20,742] Trial 15 finished with value: -0.2905951261520386 and parameters: {'clip_model': 'ViT-B/16', 'lr': 3.63718925121909e-05, 'depth': 4, 'n_augs': 2, 'n_views': 4, 'n_iters': 2400}. Best is trial 10 with value: -0.30994645655155184.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)


[I 2025-04-28 10:36:05,512] Trial 16 finished with value: -0.29106174111366273 and parameters: {'clip_model': 'ViT-B/16', 'lr': 0.0003091444716482067, 'depth': 5, 'n_augs': 0, 'n_views': 7, 'n_iters': 2100}. Best is trial 10 with value: -0.30994645655155184.


Loaded CLIP model: ViT-B/16 on cuda (jit=False)
