In [None]:
# Fetch the project repository into the current working directory.
!git clone https://github.com/ezmi234/Affordance_Highlighting_Project_2024.git

Cloning into 'Affordance_Highlighting_Project_2024'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 57 (delta 19), reused 49 (delta 11), pack-reused 0 (from 0)[K
Receiving objects: 100% (57/57), 1.84 MiB | 13.05 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [None]:
!gdown --id 1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF --output full-shape.zip

Downloading...
From (original): https://drive.google.com/uc?id=1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF
From (redirected): https://drive.google.com/uc?id=1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF&confirm=t&uuid=b63714dc-15e6-46c4-a6d2-773309501326
To: /content/Affordance_Highlighting_Project_2024/full-shape.zip
100% 558M/558M [00:07<00:00, 75.5MB/s]


In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html

!pip install trimesh
!pip install open3d

# Downgrade numpy to a compatible version
!pip install numpy==1.23.5 --force-reinstall

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-vb1s2zxc
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-vb1s2zxc
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [None]:
# Work from inside the cloned repository so that relative paths resolve against it.
%cd Affordance_Highlighting_Project_2024/

/content/Affordance_Highlighting_Project_2024


In [None]:
import torch

# Report the installed PyTorch build, its CUDA version, and whether a GPU is visible
print(f"PyTorch version: {torch.__version__}, CUDA version: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}")

# Notebook-wide device used by later cells: the GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.6.0+cu124, CUDA version: 12.4, GPU available: True


In [None]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh as mesh
import kaolin.ops.conversions as conversions
import trimesh
import numpy as np
import os
import random
import torch.nn as nn
import torchvision
import open3d as o3d

from itertools import permutations, product
from kaolin.ops.mesh import face_normals
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import color_mesh

Warp 1.7.0 initialized:
   CUDA Toolkit 12.8, Driver 12.4
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.7.0


In [None]:
!unzip full-shape.zip -d data/full-shape

Archive:  full-shape.zip
  inflating: data/full-shape/full_shape_train_data.pkl  
  inflating: data/full-shape/full_shape_val_data.pkl  


In [None]:
import pickle
def load_dataset(path):
    """Load a pickled collection of shape records and keep the fields used here.

    Args:
        path: Path to a pickle file holding an iterable of dicts. Each dict
            must provide the keys "shape_id", "semantic class", "affordance"
            and "full_shape".

    Returns:
        list[dict]: One dict per record with the keys "shape_id",
        "semantic class", "affordance" and "data_info", where "data_info" is
        the record's "full_shape" value.

    Raises:
        KeyError: If a record lacks one of the expected keys.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at files from a trusted source.
    with open(path, 'rb') as f:
        records = pickle.load(f)
    print("Loaded train_data")

    # The file is already closed here; only the needed fields are copied over.
    return [
        {
            "shape_id": info["shape_id"],
            "semantic class": info["semantic class"],
            "affordance": info["affordance"],
            "data_info": info["full_shape"],
        }
        for info in records
    ]

In [None]:
# Show available CLIP models (the names reported by clip.available_models()),
# one per line, to help pick a value for the model name used further down.
clip_models = clip.available_models()
print("Available CLIP models:")
for m in clip_models:
    print(m)

Available CLIP models:
RN50
RN101
RN50x4
RN50x16
RN50x64
ViT-B/32
ViT-B/16
ViT-L/14
ViT-L/14@336px


In [None]:
class NeuralHighlighter(nn.Module):
    """MLP that maps each input row to a probability distribution over classes.

    The network is a stack of (Linear -> ReLU -> LayerNorm) blocks followed by
    a final Linear layer and a softmax over dimension 1.
    """

    def __init__(self, input_dim=3, hidden_dim=256, output_dim=2, num_layers=6):
        """
        Args:
            input_dim: width of each input row, usually 3 (x, y, z)
            hidden_dim: width of every hidden layer
            output_dim: number of output classes, 2 for [highlight, gray]
            num_layers: total number of linear layers; values below 2 still
                produce one hidden block plus the output layer
        """
        super().__init__()

        def hidden_block(in_features):
            # One hidden stage: affine map, non-linearity, then normalization.
            return [nn.Linear(in_features, hidden_dim), nn.ReLU(), nn.LayerNorm(hidden_dim)]

        # Input width of every hidden block, in construction order: the first
        # block consumes the raw input, the remaining ones are hidden -> hidden.
        block_inputs = [input_dim] + [hidden_dim] * max(num_layers - 2, 0)

        stack = []
        for in_features in block_inputs:
            stack.extend(hidden_block(in_features))

        # Output head: project to class scores and normalize along dim 1.
        stack += [nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=1)]

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the class probabilities produced by the layer stack for `x`."""
        return self.model(x)

def get_clip_model(clipmodel='ViT-L/14', jit=False):
    """Load a CLIP model, placing it on the GPU when CUDA is available.

    Args:
        clipmodel: Model name handed to `clip.load`.
        jit: Forwarded to `clip.load` as its `jit` argument.

    Returns:
        The `(model, preprocess)` pair returned by `clip.load`.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    loaded = clip.load(clipmodel, device=device, jit=jit)
    model, preprocess = loaded
    print(f"Loaded CLIP model: {clipmodel} on {device} (jit={jit})")
    return model, preprocess


# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the final highlighted mesh and a multi-view render of it.

    The network output is hardened with an argmax so every vertex gets exactly
    one class. The mesh is recolored with that one-hot assignment, rendered
    from five views, and written out together with per-vertex colors.

    Args:
        log_dir: Directory that receives `<name>.ply` and `final_render.jpg`.
        name: Base file name (without extension) of the exported PLY.
        mesh: Mesh object passed to `color_mesh` and `render.render_views`; it
            must also offer `export(path, extension=..., color=...)`.
        mlp: Module mapping `vertices` to per-vertex class scores; class 0 is
            treated as "highlight", every other class as gray.
        vertices: Input handed to `mlp`.
        colors: Unused. Kept so existing calls keep working; the palette is
            always rebuilt from the fixed highlight / gray values below.
        render: Renderer exposing `render_views`.
        background: Background forwarded to `render.render_views`.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        # Follow the model output's device rather than a notebook-level global.
        target_device = probs.device
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # for renders: one-hot version of the prediction
        one_hot = torch.zeros(probs.shape, device=target_device).scatter_(1, max_idx, 1)

        highlight = torch.tensor([204, 255, 0], device=target_device)
        gray = torch.tensor([180, 180, 180], device=target_device)
        palette = torch.stack((highlight / 255, gray / 255))
        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # for mesh: 0-255 integer colors, broadcast from the (N, 1) class index
        final_color = torch.where(max_idx == 0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


def clip_loss(rendered_images, prompt, clip_model, aug_transform, n_augs, device, tokenizer):
    """
    Computes the CLIP loss as negative cosine similarity between
    rendered image embeddings and the text prompt embedding, averaged over
    the images in the batch and over `n_augs` augmented copies of the batch.

    Args:
        rendered_images (torch.Tensor): batch of rendered views; it is passed
            through `aug_transform` and then `clip_model.encode_image`
        prompt (str): e.g., "a gray chair with highlighted seat"
        clip_model: object providing `encode_text` and `encode_image`
        aug_transform (callable): augmentation applied to the batch before
            every image encoding
        n_augs (int): number of augmented passes to average over (>= 1)
        device: device the tokenized prompt is moved to
        tokenizer (callable): maps a list of strings to a token tensor

    Returns:
        loss (torch.Tensor): scalar CLIP loss

    Raises:
        ValueError: if `n_augs` is smaller than 1
    """
    if n_augs < 1:
        # Without this guard the final division fails with an opaque ZeroDivisionError.
        raise ValueError(f"n_augs must be at least 1, got {n_augs}")

    # Encode text; the prompt embedding is a constant target, so no gradient is tracked
    text_encoded = tokenizer([prompt]).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_encoded)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)

    loss = 0.0

    for _ in range(n_augs):
        # A fresh augmentation of the same renders on every pass
        aug_image = aug_transform(rendered_images)
        image_encoded = clip_model.encode_image(aug_image)
        loss -= torch.mean(torch.cosine_similarity(image_encoded, text_features))

    return loss / n_augs

def save_renders(dir, i, rendered_images, name=None):
    """Write a batch of rendered images to disk as a single image file.

    Args:
        dir: Base directory for the output file.
        i: Iteration index, used in the default file name.
        rendered_images: Images handed to `torchvision.utils.save_image`.
        name: Optional file name relative to `dir`; when omitted the file is
            written to `renders/iter_<i>.jpg` under `dir`.
    """
    relative_path = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    torchvision.utils.save_image(rendered_images, os.path.join(dir, relative_path))


In [None]:
# Constrain most sources of randomness
# (some torch backward functions within CLIP are non-deterministic)

# ==== Set Seed for Determinism ====
seed = 42
# Seed PyTorch's CPU generator and the CUDA generators
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Seed Python's and NumPy's global generators as well
random.seed(seed)
np.random.seed(seed)
# Disable cuDNN auto-tuning and request deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
def pointcloud_to_voxel_mesh(points, resolution=64, threshold=0.5, export_path=None):
    """Turn a point cloud into a smoothed triangle mesh by way of a voxel grid.

    The points are rescaled per axis to the unit range, voxelized, converted to
    a triangle mesh, mapped back to the original coordinate range, and finally
    smoothed with a Laplacian filter.

    Args:
        points: Tensor of points; minima and maxima are taken over dim 0.
            # assumes shape (N, 3) — TODO confirm against callers
        resolution: Voxel grid resolution passed to kaolin.
        threshold: Iso value used when extracting the surface from the grid.
        export_path: Optional file path; when given, the mesh is exported there.

    Returns:
        trimesh.Trimesh: The smoothed mesh.

    Raises:
        ValueError: If `points` is empty or no surface could be extracted.
    """
    if points.numel() == 0:
        raise ValueError("Cannot build a mesh from an empty point cloud.")

    min_coords, _ = points.min(dim=0)
    max_coords, _ = points.max(dim=0)
    scale = max_coords - min_coords
    # An axis on which every point shares one coordinate has zero extent; use a
    # unit scale there so the normalization does not divide by zero (NaN).
    scale = torch.where(scale > 0, scale, torch.ones_like(scale))
    points_norm = (points - min_coords) / scale

    voxel_grid = conversions.pointclouds_to_voxelgrids(points_norm.unsqueeze(0), resolution=resolution).to(device)
    verts_faces = conversions.voxelgrids_to_trianglemeshes(voxel_grid, iso_value=threshold)

    # Batch element 0; vertices are divided by the resolution to undo grid units
    verts = verts_faces[0][0].cpu() / resolution
    faces = verts_faces[1][0].cpu()

    if verts.numel() == 0 or faces.numel() == 0:
        raise ValueError("Empty mesh generated from voxel grid.")

    # Denormalize back to the input coordinate range
    verts = verts * scale.cpu() + min_coords.cpu()

    # Create mesh
    tri_mesh = trimesh.Trimesh(vertices=verts.numpy(), faces=faces.numpy())

    # Smoothing and export
    tri_mesh = trimesh.smoothing.filter_laplacian(
        tri_mesh, lamb=0.2, iterations=8,
        implicit_time_integration=False,
        volume_constraint=True
    )

    if export_path:
        tri_mesh.export(export_path)

    return tri_mesh

In [None]:
def load_vertices(data, num_points=4096):
    """Return a float32 point tensor on the notebook's `device`.

    Args:
        data: Either a path (str or os.PathLike) to a triangle mesh file, or
            array-like point coordinates.
        num_points: Number of points sampled uniformly from the mesh surface
            when `data` is a path. Ignored for array-like input.

    Returns:
        torch.Tensor: float32 tensor on `device`. For a path it holds the
        sampled surface points; otherwise it holds `data` converted as-is.

    Raises:
        FileNotFoundError: If `data` is a path that does not point to a file.
    """
    if isinstance(data, (str, os.PathLike)):
        mesh_path = os.fspath(data)
        # Fail early with a clear message instead of sampling from a mesh that
        # could not be read.
        if not os.path.isfile(mesh_path):
            raise FileNotFoundError(f"Mesh file not found: {mesh_path}")
        mesh = o3d.io.read_triangle_mesh(mesh_path)
        mesh.compute_vertex_normals()
        sampled = mesh.sample_points_uniformly(number_of_points=num_points)
        return torch.tensor(np.asarray(sampled.points), dtype=torch.float32).to(device)

    return torch.tensor(data, dtype=torch.float32).to(device)

In [None]:
# ==== Hyperparameters and Settings ====
render_res = 224  # side length used for the renderer and for the augmentation crop
learning_rate = 0.0007  # step size of the Adam optimizer
n_iter = 1800  # number of optimization iterations
obj_path = 'data/dog.obj'  # only its base name is read below (objbase), and later cells reassign that
n_augs = 3  # augmented passes per CLIP loss evaluation
output_dir = './output/'  # root folder for renders, logs and the exported mesh
clip_model_name = 'ViT-B/32'  # name passed to get_clip_model
prompt = 'A gray chair with highlighted back'  # default text prompt; reassigned by later cells

In [None]:
# ==== Setup Output Directory ====
# Create output_dir together with its 'renders' subfolder (no error if they already exist)
Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)
# Logs and final results are written directly into the output directory
log_dir = output_dir

In [None]:
# ==== Load Mesh ====
# Base name of obj_path; NOTE(review): later cells assign objbase again
objbase, extension = os.path.splitext(os.path.basename(obj_path))
render = Renderer(dim=(render_res, render_res))

# ==== CLIP ====
clip_model, preprocess = get_clip_model(clip_model_name)
tokenizer = clip.tokenize

# Channel-wise mean/std normalization applied to images before CLIP encoding
clip_normalizer = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],std=[0.26862954, 0.26130258, 0.27577711]) #from https://github.com/openai/CLIP/issues/20

# Augmentation pipeline used inside clip_loss: random resized crop to
# render_res, random perspective warp (applied with probability 0.8, fill
# value 1), then CLIP normalization
aug_transform = transforms.Compose([
        transforms.RandomResizedCrop(render_res, scale=(1, 1)),
        transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
        clip_normalizer
    ])

# ==== Neural Highlighter ====
mlp = NeuralHighlighter().to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# ==== Colors and Other Constants ====
# Row 0 is the highlight color, row 1 the gray base color (RGB scaled to [0, 1])
colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)
background = torch.tensor((1., 1., 1.)).to(device)
n_views = 5  # views rendered per optimization step
losses = []  # loss history appended to by the optimization loop

100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 186MiB/s]


Loaded CLIP model: ViT-B/32 on cuda (jit=False)


In [None]:
# Load dataset: the training split extracted from full-shape.zip
dataset = load_dataset("data/full-shape/full_shape_train_data.pkl")

Loaded train_data


In [None]:
# Input selection: coordinates of dataset entry 13500, saved under the name 'table'.
# NOTE(review): the following cells assign data/objbase/prompt again, so when
# the notebook is run top to bottom only the last such cell takes effect.
#data = "data/dog.obj"
data = dataset[13500]["data_info"]["coordinate"]
objbase = 'table'
prompt = 'A gray table with highlighted legs'

In [None]:
# Input selection: coordinates of dataset entry 1852, saved under the name 'vase'.
# NOTE(review): the following cells assign data/objbase/prompt again, so when
# the notebook is run top to bottom only the last such cell takes effect.
#data = "data/dog.obj"
data = dataset[1852]["data_info"]["coordinate"]
objbase = 'vase'
prompt = 'A gray vase with highlighted border'

In [None]:
# Input selection: coordinates of dataset entry 3915, saved under the name 'chair'.
# NOTE(review): the following cell assigns data/objbase/prompt again, so when
# the notebook is run top to bottom this selection is overridden.
#data = "data/dog.obj"
data = dataset[3915]["data_info"]["coordinate"]
objbase = 'chair'
prompt = 'A gray chair with highlighted shoes'

In [None]:
# Input selection: the mesh file data/dog.obj (load_vertices samples points from it).
# NOTE(review): the data is the dog mesh while objbase and prompt still say
# 'chair' — confirm this pairing is intended; the result is exported as chair.ply.
data = "data/dog.obj"
#data = dataset[3915]["data_info"]["coordinate"]
objbase = 'chair'
prompt = 'A gray chair with highlighted shoes'

In [None]:
# Turn the selected input into a point tensor on `device`
vertices = load_vertices(data)
# Temporary OBJ file used to hand the voxel mesh over to the Mesh loader
temp_obj_path = "outputDemo.obj"

# NOTE(review): this rebinds `mesh`, which the import cell bound to the module
# kaolin.ops.mesh — the module alias is no longer reachable after this cell.
mesh = pointcloud_to_voxel_mesh(
    vertices,  # point cloud (sampled via Open3D when `data` is a file path)
    resolution=16,
    threshold=0.5,
    export_path=temp_obj_path
)

# === Load the voxel mesh from disk ===
sampled_mesh = Mesh(temp_obj_path)
MeshNormalizer(sampled_mesh)()
# From here on `vertices` holds the normalized mesh vertices, not the input point cloud
vertices = torch.tensor(sampled_mesh.vertices, dtype=torch.float32, device=device)

  vg = torch.sparse.FloatTensor(
  vertices = torch.tensor(sampled_mesh.vertices, dtype=torch.float32, device=device)


In [None]:
# Optimization loop: train the highlighter so that renders of the colored mesh
# match the text prompt under CLIP.
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # === Predict highlight probabilities ===
    pred_class = mlp(vertices)

    # === Color mesh ===
    color_mesh(pred_class, sampled_mesh, colors)

    # === Render the mesh ===
    rendered_images, elev, azim = render.render_views(
        sampled_mesh,
        num_views=n_views,
        show=False,
        center_azim=0,
        center_elev=0,
        std=1,
        return_views=True,
        lighting=True,
        background=background
    )

    # === Compute CLIP loss ===
    loss = clip_loss(rendered_images, prompt, clip_model, aug_transform, n_augs, device, tokenizer)
    # NOTE(review): pred_class is recomputed every iteration, so retain_graph=True
    # may be unnecessary — confirm that nothing reuses the graph across steps.
    loss.backward(retain_graph=True)
    optim.step()

    # === Save and log results ===
    # `losses` is a notebook-level list, so it keeps growing if this cell is re-run
    with torch.no_grad():
        losses.append(loss.item())

    if i % 100 == 0:
        # The logged "CLIP score" is the loss value (negative cosine similarity),
        # averaged over at most the last 100 recorded iterations
        print(f"Last 100 CLIP score: {np.mean(losses[-100:])}")
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

# Remove generated obj
os.remove(temp_obj_path)

# Final save: hard per-vertex assignment exported as <objbase>.ply plus final_render.jpg
save_final_results(log_dir, objbase, sampled_mesh, mlp, vertices, colors, render, background)

# Save prompt: an empty marker file whose file name is the prompt text
with open(os.path.join(output_dir, prompt), "w") as f:
    f.write('')

  0%|          | 1/1800 [00:02<1:21:14,  2.71s/it]

Last 100 CLIP score: -0.2230224609375


  6%|▌         | 102/1800 [00:16<03:29,  8.11it/s]

Last 100 CLIP score: -0.249603271484375


 11%|█         | 202/1800 [00:29<03:21,  7.93it/s]

Last 100 CLIP score: -0.250072021484375


 17%|█▋        | 302/1800 [00:42<03:10,  7.87it/s]

Last 100 CLIP score: -0.249422607421875


 22%|██▏       | 402/1800 [00:56<03:49,  6.10it/s]

Last 100 CLIP score: -0.25163330078125


 28%|██▊       | 502/1800 [01:10<02:55,  7.38it/s]

Last 100 CLIP score: -0.24892333984375


 33%|███▎      | 602/1800 [01:23<02:32,  7.83it/s]

Last 100 CLIP score: -0.24936767578125


 39%|███▉      | 702/1800 [01:36<02:16,  8.06it/s]

Last 100 CLIP score: -0.24926513671875


 45%|████▍     | 802/1800 [01:49<02:07,  7.81it/s]

Last 100 CLIP score: -0.249429931640625


 50%|█████     | 902/1800 [02:03<01:54,  7.83it/s]

Last 100 CLIP score: -0.2496875


 56%|█████▌    | 1002/1800 [02:16<01:42,  7.79it/s]

Last 100 CLIP score: -0.25026123046875


 61%|██████    | 1102/1800 [02:30<01:27,  7.95it/s]

Last 100 CLIP score: -0.250238037109375


 67%|██████▋   | 1202/1800 [02:43<01:29,  6.65it/s]

Last 100 CLIP score: -0.250736083984375


 72%|███████▏  | 1302/1800 [02:56<01:28,  5.62it/s]

Last 100 CLIP score: -0.250946044921875


 78%|███████▊  | 1402/1800 [03:10<00:54,  7.27it/s]

Last 100 CLIP score: -0.251490478515625


 83%|████████▎ | 1502/1800 [03:23<00:36,  8.06it/s]

Last 100 CLIP score: -0.250147705078125


 89%|████████▉ | 1602/1800 [03:36<00:25,  7.84it/s]

Last 100 CLIP score: -0.2498974609375


 95%|█████████▍| 1702/1800 [03:49<00:13,  7.44it/s]

Last 100 CLIP score: -0.248304443359375


100%|██████████| 1800/1800 [04:02<00:00,  7.41it/s]


In [None]:
# Cleanup: delete the whole output directory (renders, training log, exported mesh).
%rm -rf output/

In [None]:
# Cleanup: delete only the per-iteration renders stored under output/renders/.
%rm -rf output/renders/