In [None]:
# Install the notebook's third-party dependencies into the runtime.
# CLIP is installed straight from its GitHub repository (no version pin).
!pip install git+https://github.com/openai/CLIP.git
# Kaolin 0.17.0 is fetched from NVIDIA's wheel index for torch 2.5.1 / CUDA 12.1;
# NOTE(review): this assumes the runtime's PyTorch/CUDA build matches that index - confirm.
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
# Optuna drives the hyperparameter search at the end of the notebook (no version pin).
!pip install optuna

# Downgrade numpy to a compatible version
# (this is the last install in the cell, so none of the installs above can replace it afterwards)
!pip install numpy==1.23.5 --force-reinstall

In [None]:
# Fetch the project repository; later cells import Normalization, mesh, render and utils,
# which are presumably provided by this checkout - confirm.
!git clone https://github.com/ezmi234/Affordance_Highlighting_Project_2024.git

In [None]:
# Work from inside the cloned repository so that relative paths such as 'data/dog.obj'
# and the local module imports in later cells resolve against it.
%cd Affordance_Highlighting_Project_2024
# Use the branch this notebook was written against.
!git checkout part1-mesh-highlighter

In [None]:
import torch

# Show details
print(f"PyTorch version: {torch.__version__}, CUDA version: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}")

# Notebook-wide compute device: the GPU when CUDA is available, otherwise the CPU.
# Later cells read this global when creating tensors and moving the MLP.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import clip
import copy
import kaolin as kal
import numpy as np
import os
import random
import torch.nn as nn
import torchvision
from datetime import datetime
from google.colab import drive
# Mount Google Drive: `objective` writes every trial's outputs under
# /content/drive/MyDrive/affordance_outputs. This import only works on Colab.
drive.mount('/content/drive')

from Normalization import MeshNormalizer
from mesh import Mesh
from render import Renderer
from torchvision import transforms
from utils import color_mesh
import optuna
import time
import gc

In [None]:
# Print the model names that clip.available_models() reports, one per line,
# as a reference for the "clip_model" choices sampled in `objective`.
clip_models = clip.available_models()
print("Available CLIP models:")
for m in clip_models:
    print(m)

In [None]:
class NeuralHighlighter(nn.Module):
    """MLP that turns per-point features into class probabilities.

    The network is a sequence of ``Linear -> ReLU -> LayerNorm`` stages followed
    by a final ``Linear`` projection and a softmax taken over dimension 1.
    """

    def __init__(self, input_dim=3, hidden_dim=256, output_dim=2, num_layers=6):
        """
        Args:
            input_dim: number of input features per point, usually 3 (x, y, z)
            hidden_dim: width of every hidden layer
            output_dim: number of output classes, 2 for [highlight, gray]
            num_layers: total number of linear layers, counting the input
                and the output projection
        """
        super().__init__()

        # Input width of each Linear -> ReLU -> LayerNorm stage: the first stage
        # reads the raw features, the remaining (num_layers - 2) stages are
        # hidden-to-hidden. A non-positive repeat count yields no extra stages.
        stage_inputs = [input_dim] + [hidden_dim] * (num_layers - 2)

        stack = []
        for in_features in stage_inputs:
            stack += [
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.LayerNorm(hidden_dim),
            ]

        # Output head: project to class scores and normalise them to probabilities.
        stack += [nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=1)]  # 2-class output

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the stack and return the softmax output."""
        return self.model(x)


def get_clip_model(clipmodel='ViT-L/14', jit=False):
    """Load a CLIP checkpoint on the GPU when CUDA is available, else on the CPU.

    Args:
        clipmodel: model name handed to ``clip.load``.
        jit: forwarded unchanged to ``clip.load``.

    Returns:
        The ``(model, preprocess)`` pair produced by ``clip.load``.
    """
    # Chosen locally from CUDA availability rather than read from a global.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model, preprocess = clip.load(clipmodel, jit=jit, device=device)
    print(f"Loaded CLIP model: {clipmodel} on {device} (jit={jit})")
    return model, preprocess


# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the final hard segmentation as a colored mesh and a rendered image sheet.

    Each vertex is assigned to its most probable class (argmax of the MLP output).
    The mesh is recolored with that one-hot assignment, rendered from 5 views and
    saved as ``final_render.jpg``; the per-vertex colors are exported to
    ``{name}.ply``. Both files are written into ``log_dir``.

    Args:
        log_dir: directory receiving ``{name}.ply`` and ``final_render.jpg``.
        name: base file name (without extension) of the exported mesh.
        mesh: mesh object; recolored in place through ``color_mesh`` and written
            with its ``export`` method.
        mlp: network mapping ``vertices`` to per-vertex class probabilities. It is
            switched to eval mode and left in that mode.
        vertices: per-vertex input for ``mlp``.
        colors: kept for interface compatibility only; it is not read. The fixed
            highlight/gray palette defined below is always used.
        render: renderer exposing ``render_views``.
        background: background color forwarded to ``render_views``.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # Allocate helpers on the device the predictions live on instead of relying
        # on a notebook-level global that might not be defined or might differ.
        target_device = probs.device

        # for renders: hard one-hot class assignment per vertex
        one_hot = torch.zeros(probs.shape, device=target_device).scatter_(1, max_idx, 1)

        # Class 0 -> highlight color, class 1 -> gray, as 0-255 integer RGB.
        highlight = torch.tensor([204, 255, 0], device=target_device)
        gray = torch.tensor([180, 180, 180], device=target_device)
        # Same palette rescaled to [0, 1] for coloring the mesh before rendering.
        palette = torch.stack((highlight / 255, gray / 255))

        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # for mesh: (N, 1) class index broadcast against the (3,) RGB triples
        final_color = torch.where(max_idx == 0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')

def clip_loss(rendered_images, text_prompt, clip_transform, clip_model, tokenizer, device, aug_transform=None, n_augs=0):
    """
    Computes the CLIP loss as negative cosine similarity between
    rendered image embeddings and the text prompt embedding.

    With ``n_augs == 0`` the images are passed through ``clip_transform`` once.
    With ``n_augs > 0`` the images are passed through ``aug_transform``
    ``n_augs`` times and the per-pass losses are averaged. Both paths cast the
    image embeddings to float32 and L2-normalize them, so they are compared
    with the (float32, normalized) text embedding in the same way.

    The text prompt is re-encoded on every call, without gradient tracking.

    Args:
        rendered_images (torch.Tensor): shape (B, 3, H, W)
        text_prompt (str): e.g., "a gray chair with highlighted seat"
        clip_transform (callable): preprocessing applied when n_augs == 0
        clip_model (torch.nn.Module): preloaded CLIP model exposing
            ``encode_text`` and ``encode_image``
        tokenizer (callable): CLIP tokenizer; called with a one-element list
        device (str or torch.device): device the text tokens are moved to
        aug_transform (callable): augmentation + preprocessing applied when
            n_augs > 0; required in that case
        n_augs (int): number of augmentation passes (0 disables augmentation)
    Returns:
        loss (torch.Tensor): scalar CLIP loss
    Raises:
        ValueError: if ``n_augs`` is negative, or if ``n_augs > 0`` while
            ``aug_transform`` is None.
    """
    # Fail early with a clear message instead of returning a plain float
    # (negative n_augs) or hitting "'NoneType' object is not callable".
    if n_augs < 0:
        raise ValueError(f"n_augs must be >= 0, got {n_augs}")
    if n_augs > 0 and aug_transform is None:
        raise ValueError("aug_transform is required when n_augs > 0")

    # Encode text
    text_tokens = tokenizer([text_prompt]).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_tokens).float()
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)  # L2 norm

    def negative_similarity(images):
        # Cast to float32 so the dtype matches text_features regardless of the
        # precision the CLIP model computes in.
        image_features = clip_model.encode_image(images).float()
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        # The single text embedding broadcasts against the batch of image embeddings.
        return -torch.mean(torch.cosine_similarity(image_features, text_features))

    if n_augs == 0:
        return negative_similarity(clip_transform(rendered_images))

    loss = 0.0
    for _ in range(n_augs):
        loss = loss + negative_similarity(aug_transform(rendered_images))

    return loss / n_augs


def save_renders(dir, i, rendered_images, name=None):
    """Save a batch of rendered images as one image file below ``dir``.

    Args:
        dir: base output directory.
        i: iteration index; only used to build the default file name.
        rendered_images: image tensor handed to ``torchvision.utils.save_image``.
        name: file name relative to ``dir``. When None, the image is written to
            ``renders/iter_{i}.jpg`` below ``dir``.

    The destination's parent directory is created if it does not exist yet, so
    the default ``renders/`` location does not have to be prepared by the caller.
    """
    relative_path = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    target = os.path.join(dir, relative_path)

    # os.makedirs('') raises, so only create the parent when there is one.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    torchvision.utils.save_image(rendered_images, target)

In [None]:
# ==== Settings ====
# Side length in pixels of the square renders; also used as the target size of the
# Resize / RandomResizedCrop transforms defined in the next cell.
render_res = 224
# Mesh to highlight, relative to the repository root entered by the earlier %cd cell.
obj_path = 'data/dog.obj'
# Text prompt scored against the renders by clip_loss in every trial.
prompt = 'A gray dog with highlighted hat'

In [None]:
# ==== Load Mesh ====
# objbase / extension are the file stem and suffix of obj_path; they are not read
# anywhere else in the visible notebook.
objbase, extension = os.path.splitext(os.path.basename(obj_path))
render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
# Build the normalizer and invoke it once on the mesh.
# NOTE(review): presumably normalizes the mesh in place - confirm in Normalization.py.
MeshNormalizer(mesh)()

# ==== Normalization and Augmentation ====
# Per-channel mean / std handed to Normalize.
# NOTE(review): these look like CLIP's standard preprocessing statistics - confirm.
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# Deterministic preprocessing, used by clip_loss when n_augs == 0.
clip_transform = transforms.Compose([
    transforms.Resize((render_res, render_res)),
    clip_normalizer
])

# Randomized preprocessing, used by clip_loss once per augmentation pass when n_augs > 0.
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(render_res, scale=(1, 1)),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    clip_normalizer
])

# ==== Colors and Other Constants ====
# Row 0 is the highlight color (204, 255, 0) and row 1 the gray (180, 180, 180), scaled to [0, 1].
colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)
# White background for the renderer.
background = torch.tensor((1., 1., 1.)).to(device)
# Independent copy of the mesh vertices on the compute device; this is the MLP input.
vertices = copy.deepcopy(mesh.vertices).to(device)
# Note: `objective` samples its own local n_views per trial, so this global is not read there.
n_views = 5

In [None]:
def objective(trial):
    """Optuna objective: train one highlighter MLP and return its final mean CLIP loss.

    For the hyperparameters sampled from ``trial`` (CLIP model, learning rate,
    MLP depth, number of augmentations, number of views, number of iterations)
    this function:

    1. re-seeds the random number generators with a fixed seed,
    2. creates a timestamped output directory on Google Drive,
    3. optimizes a fresh ``NeuralHighlighter`` against ``clip_loss`` using the
       notebook-level ``mesh``, ``vertices``, ``colors``, ``render``,
       ``background``, ``prompt`` and transforms,
    4. writes a render sheet and a log line every 100 iterations,
    5. releases the CLIP model (also when training fails), and
    6. exports the final mesh/render and a ``final_score.txt`` summary.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: mean loss over the last (up to) 100 iterations; lower is better.
    """
    # Constrain most sources of randomness
    # (some torch backward functions within CLIP are non-deterministic)

    # ==== Set Seed for Determinism ====
    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Sample hyperparameters
    clip_model_name = trial.suggest_categorical("clip_model", ["ViT-B/32", "ViT-B/16", "ViT-L/14"])
    safe_model_name = clip_model_name.replace("/", "-")  # "/" would split the directory name
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    network_depth = trial.suggest_int("depth", 3, 8)
    n_augs = trial.suggest_int("n_augs", 0, 5)
    n_views = trial.suggest_int("n_views", 2, 8)
    n_iters = trial.suggest_int("n_iters", 1500, 3000, step=100)

    # Timestamped export path to Google Drive
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_name = f"trial_{trial.number}_{safe_model_name}_lr{learning_rate:.1e}_d{network_depth}_v{n_views}_a{n_augs}_i{n_iters}"
    export_path = f"/content/drive/MyDrive/affordance_outputs/optuna_{timestamp}_{run_name}"
    os.makedirs(export_path+"/renders", exist_ok=True)
    log_path = os.path.join(export_path, "log.txt")

    # === Load components ===
    # The preprocess callable returned alongside the model is not needed here:
    # renders go through clip_transform / augment_transform instead.
    model, _ = get_clip_model(clip_model_name)
    # CLIP only scores the renders and is never optimized. Freezing its weights
    # avoids computing and storing a gradient for every CLIP parameter on each
    # backward pass; gradients still flow through CLIP to the rendered images.
    model.requires_grad_(False)
    tokenizer = clip.tokenize

    # Define MLP with trial's depth
    mlp = NeuralHighlighter(num_layers=network_depth).to(device)
    optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate)

    losses = []
    # Pre-bind the loop tensors so the cleanup below is valid even if the very
    # first iteration fails.
    loss = rendered_images = pred_class = None
    start_time = time.time()
    try:
        for i in range(n_iters):
            optimizer.zero_grad()
            pred_class = mlp(vertices)

            color_mesh(pred_class, mesh, colors)
            rendered_images, _, _ = render.render_views(
                mesh,
                num_views=n_views,
                show=False,
                center_azim=0,
                center_elev=0,
                std=1,
                return_views=True,
                lighting=True,
                background=background
            )

            loss = clip_loss(rendered_images, prompt, clip_transform, model, tokenizer, device, augment_transform, n_augs)
            # NOTE(review): the forward pass is rebuilt every iteration, so
            # retain_graph=True is probably unnecessary; it is kept because
            # color_mesh / render internals are not visible here - confirm.
            loss.backward(retain_graph=True)
            optimizer.step()

            loss_value = loss.item()
            losses.append(loss_value)

            # Save intermediate results every 100 iterations
            if i % 100 == 0:
                save_renders(export_path, i, rendered_images)
                with open(log_path, "a") as f:
                    f.write(f"Iter {i} | Loss: {loss_value:.4f} | Last 100 avg CLIP score: {np.mean(losses[-100:]):.4f}\n")

        total_time = time.time() - start_time
    finally:
        # Release CLIP before the next trial loads another model, even when this
        # trial raised. The last loss / renders still reference the autograd
        # graph (and through it the CLIP weights), so they must be dropped too;
        # collect garbage first so empty_cache() can actually return the memory.
        del model
        loss = rendered_images = pred_class = None
        gc.collect()
        torch.cuda.empty_cache()

    minutes, seconds = divmod(total_time, 60)

    # Use the average of the last (up to) 100 iterations as the final score;
    # dividing by the real window length keeps it correct for short runs.
    tail = losses[-100:]
    final_score = sum(tail) / len(tail)

    # Save final output
    save_final_results(export_path, run_name, mesh, mlp, vertices, colors, render, background)
    with open(os.path.join(export_path, "final_score.txt"), "w") as f:
        f.write(f"Prompt: {prompt}\n")
        f.write(f"Final average CLIP loss: {final_score:.4f}\n")
        f.write(f"Total time: {int(minutes)}m {int(seconds)}s\n")
        f.write(str(trial.params))

    return final_score

In [None]:
# Run the hyperparameter search: `objective` returns a loss, so the study minimizes it.
# Each of the 50 trials trains a highlighter for 1500-3000 iterations, so this cell is long-running.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)