In [1]:
!git clone https://github.com/ezmi234/Affordance_Highlighting_Project_2024.git

Cloning into 'Affordance_Highlighting_Project_2024'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 48 (delta 14), reused 42 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (48/48), 1.81 MiB | 2.71 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [1]:
%cd Affordance_Highlighting_Project_2024/

/content/Affordance_Highlighting_Project_2024


In [3]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html

# Downgrade numpy to a compatible version
!pip install numpy==1.23.5 --force-reinstall

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-xshdagdi
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-xshdagdi
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [2]:
import torch

# Show details
print(f"PyTorch version: {torch.__version__}, CUDA version: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.5.1+cu124, CUDA version: 12.4, GPU available: True


In [3]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch.nn as nn
import torchvision

from itertools import permutations, product
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import color_mesh

Warp 1.7.0 initialized:
   CUDA Toolkit 12.8, Driver 12.4
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.7.0


In [4]:
# Show available CLIP models
clip_models = clip.available_models()
print("Available CLIP models:")
for m in clip_models:
    print(m)

Available CLIP models:
RN50
RN101
RN50x4
RN50x16
RN50x64
ViT-B/32
ViT-B/16
ViT-L/14
ViT-L/14@336px


In [5]:
class NeuralHighlighter(nn.Module):
    def __init__(self, input_dim=3, hidden_dim=256, output_dim=2, num_layers=6):
        """
        Args:
            input_dim: usually 3 (x, y, z)
            hidden_dim: size of hidden layers
            output_dim: 2 for [highlight, gray]
            num_layers: total number of linear layers
        """
        super(NeuralHighlighter, self).__init__()

        layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.LayerNorm(hidden_dim)]

        for _ in range(num_layers - 2):
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.LayerNorm(hidden_dim))

        layers.append(nn.Linear(hidden_dim, output_dim))
        layers.append(nn.Softmax(dim=1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

def get_clip_model(clipmodel='ViT-L/14', jit=False):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load(clipmodel, device=device, jit=jit)
    print(f"Loaded CLIP model: {clipmodel} on {device} (jit={jit})")
    return model, preprocess


# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)
        # for renders
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)
        sampled_mesh = mesh

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        colors = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, sampled_mesh, colors)
        rendered_images, _, _ = render.render_views(sampled_mesh, num_views=5,
                                                                        show=False,
                                                                        center_azim=0,
                                                                        center_elev=0,
                                                                        std=1,
                                                                        return_views=True,
                                                                        lighting=True,
                                                                        background=background)
        # for mesh
        final_color = torch.zeros(vertices.shape[0], 3).to(device)
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


def clip_loss(rendered_images, prompt, clip_model, aug_transform, n_augs, device, tokenizer):
    """
    Computes the CLIP loss as negative cosine similarity between
    rendered image embeddings and the text prompt embedding.

    Args:
        rendered_images (torch.Tensor): shape (B, 3, H, W)
        text_prompt (str): e.g., "a gray chair with highlighted seat"
        clip_model (torch.nn.Module): preloaded CLIP model
        device (str): "cuda" or "cpu"

    Returns:
        loss (torch.Tensor): scalar CLIP loss
    """
    # Encode text


    text_encoded = tokenizer([prompt]).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_encoded)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)

    loss = 0.0

    for _ in range(n_augs):
      aug_image = aug_transform(rendered_images)
      image_encoded = clip_model.encode_image(rendered_images)
      loss -= torch.mean(torch.cosine_similarity(image_encoded, text_features))

    return loss / n_augs

def save_renders(dir, i, rendered_images, name=None):
    if name is not None:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, name))
    else:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, 'renders/iter_{}.jpg'.format(i)))


In [6]:
# Constrain most sources of randomness
# (some torch backwards functions within CLIP are non-determinstic)

# ==== Set Seed for Determinism ====
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [7]:
# ==== Hyperparameters and Settings ====
render_res = 224
learning_rate = 0.0007
n_iter = 1800
obj_path = 'data/dog.obj'
n_augs = 3
output_dir = './output/'
clip_model_name = 'ViT-B/32'
prompt = 'A gray dog with highlighted mask'

In [8]:
# ==== Setup Output Directory ====
Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)
log_dir = output_dir

In [9]:
# ==== Load Mesh ====
objbase, extension = os.path.splitext(os.path.basename(obj_path))
render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
MeshNormalizer(mesh)()

# ==== CLIP ====
clip_model, preprocess = get_clip_model(clip_model_name)
tokenizer = clip.tokenize

clip_normalizer = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],std=[0.26862954, 0.26130258, 0.27577711]) #from https://github.com/openai/CLIP/issues/20

aug_transform = transforms.Compose([
        transforms.RandomResizedCrop(render_res, scale=(1, 1)),
        transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
        clip_normalizer
    ])




# ==== Neural Highlighter ====
mlp = NeuralHighlighter().to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# ==== Colors and Other Constants ====
colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)
background = torch.tensor((1., 1., 1.)).to(device)
vertices = copy.deepcopy(mesh.vertices).to(device)
n_views = 5
losses = []

100%|███████████████████████████████████████| 338M/338M [00:10<00:00, 32.5MiB/s]


Loaded CLIP model: ViT-B/32 on cuda (jit=False)


In [10]:
# Optimization loop
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)

    # color and render mesh
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=1,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=background)

    # Calculate CLIP Loss
    loss = clip_loss(rendered_images, prompt, clip_model, aug_transform, n_augs, device, tokenizer)

    loss.backward(retain_graph=True)

    optim.step()

    # update variables + record loss
    with torch.no_grad():
        losses.append(loss.item())

    # report results
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")


# save results
save_final_results(log_dir, objbase, mesh, mlp, vertices, colors, render, background)

# ==== Save Prompt ====
with open(os.path.join(output_dir, prompt), "w") as f:
    f.write('')

  0%|          | 1/1900 [00:01<54:38,  1.73s/it]

Last 100 CLIP score: -0.258544921875


  5%|▌         | 102/1900 [00:15<03:35,  8.33it/s]

Last 100 CLIP score: -0.250947265625


 11%|█         | 202/1900 [00:27<03:22,  8.38it/s]

Last 100 CLIP score: -0.25961669921875


 16%|█▌        | 302/1900 [00:41<03:14,  8.20it/s]

Last 100 CLIP score: -0.262685546875


 21%|██        | 402/1900 [00:53<03:05,  8.07it/s]

Last 100 CLIP score: -0.264002685546875


 26%|██▋       | 502/1900 [01:06<02:50,  8.21it/s]

Last 100 CLIP score: -0.261229248046875


 32%|███▏      | 602/1900 [01:18<02:40,  8.09it/s]

Last 100 CLIP score: -0.2643310546875


 37%|███▋      | 702/1900 [01:31<02:24,  8.30it/s]

Last 100 CLIP score: -0.26310546875


 42%|████▏     | 802/1900 [01:43<02:15,  8.09it/s]

Last 100 CLIP score: -0.263486328125


 47%|████▋     | 902/1900 [01:56<02:04,  8.03it/s]

Last 100 CLIP score: -0.2628759765625


 53%|█████▎    | 1002/1900 [02:09<01:50,  8.13it/s]

Last 100 CLIP score: -0.265157470703125


 58%|█████▊    | 1102/1900 [02:21<01:35,  8.33it/s]

Last 100 CLIP score: -0.26439208984375


 63%|██████▎   | 1202/1900 [02:36<01:54,  6.11it/s]

Last 100 CLIP score: -0.26198974609375


 69%|██████▊   | 1302/1900 [02:49<01:18,  7.62it/s]

Last 100 CLIP score: -0.2613623046875


 74%|███████▍  | 1402/1900 [03:02<01:02,  7.97it/s]

Last 100 CLIP score: -0.26208251953125


 79%|███████▉  | 1502/1900 [03:15<00:49,  7.96it/s]

Last 100 CLIP score: -0.261824951171875


 84%|████████▍ | 1602/1900 [03:28<00:36,  8.23it/s]

Last 100 CLIP score: -0.26314453125


 90%|████████▉ | 1702/1900 [03:41<00:23,  8.39it/s]

Last 100 CLIP score: -0.261376953125


 95%|█████████▍| 1802/1900 [03:54<00:11,  8.28it/s]

Last 100 CLIP score: -0.259896240234375


100%|██████████| 1900/1900 [04:06<00:00,  7.71it/s]


In [12]:
%rm -rf output/renders/