In [None]:
# Install the third-party packages used by this notebook (CLIP is installed from its Git repository).
!pip install git+https://github.com/openai/CLIP.git
# kaolin is pinned to 0.17.0; the wheel index URL below names torch 2.5.1 / CUDA 12.1.
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
!pip install optuna
!pip install trimesh
!pip install open3d
!pip install python-dotenv

# Force-reinstall numpy 1.23.5 as the last step, after all other installs.
# NOTE(review): presumably required by one of the packages above; if numpy was already
# imported in this runtime a restart is likely needed for the pin to take effect — confirm.
!pip install numpy==1.23.5 --force-reinstall

In [None]:
# Fetch the project repository; presumably it provides the local modules imported further
# below (mesh, render, utils, Normalization, utilities.*) — verify against the repo.
!git clone https://github.com/ezmi234/Affordance_Highlighting_Project_2024.git

In [None]:
# Work from inside the cloned repository so relative paths (e.g. "data/...") resolve against it.
%cd Affordance_Highlighting_Project_2024
# Switch to the branch this notebook is written against.
!git checkout part3-affordancenet-benchmark

In [None]:
import torch

# Report the PyTorch / CUDA versions and whether a GPU is visible to this runtime.
print(f"PyTorch version: {torch.__version__}, CUDA version: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}")

# Notebook-wide device used by the cells below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import clip
import kaolin as kal
import trimesh
import numpy as np
import os
from dotenv import load_dotenv
import random
import torch.nn as nn
import torch.nn.functional as F
import torchvision

from datetime import datetime
from kaolin.ops.mesh import face_normals
from Normalization import MeshNormalizer
from mesh import Mesh
from render import Renderer
from tqdm import tqdm
from torchvision import transforms
from utils import color_mesh
import pickle
from utilities.dataset import load_dataset, get_coordinates, get_affordance_label, is_affordance_present, split_dataset_by_class_and_affordance
from utilities.point_cloud import pointcloud_to_voxel_mesh, project_vertex_scores_to_pointcloud, visualize_affordance_pointcloud

from google.colab import drive
drive.mount('/content/drive')
import optuna
import gc
import pandas as pd
import time

In [None]:
class NeuralHighlighter(nn.Module):
    """MLP mapping each input row to a probability distribution over classes.

    The network is a stack of ``Linear -> ReLU -> LayerNorm`` stages followed by
    a final ``Linear`` projection and a softmax over dimension 1, so the output
    rows are already normalized probabilities.
    """

    def __init__(self, input_dim=3, hidden_dim=256, output_dim=2, num_layers=6):
        """
        Args:
            input_dim: usually 3 (x, y, z)
            hidden_dim: size of hidden layers
            output_dim: 2 for [highlight, gray]
            num_layers: total number of linear layers
        """
        super().__init__()

        def make_stage(in_features):
            # One hidden stage: projection, non-linearity, normalization.
            return [nn.Linear(in_features, hidden_dim), nn.ReLU(), nn.LayerNorm(hidden_dim)]

        # The first and the last linear layers are fixed; the rest are hidden-to-hidden.
        extra_hidden_stages = num_layers - 2

        modules = make_stage(input_dim)
        for _ in range(extra_hidden_stages):
            modules.extend(make_stage(hidden_dim))
        modules += [nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=1)]  # 2-class output

        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Return the softmax class probabilities for every row of ``x``."""
        return self.model(x)


def get_clip_model(clipmodel='ViT-L/14', jit=False):
    """Load a CLIP model on the GPU when one is available, otherwise on the CPU.

    Args:
        clipmodel: model name forwarded to ``clip.load``.
        jit: forwarded to ``clip.load``.

    Returns:
        The ``(model, preprocess)`` pair returned by ``clip.load``.
    """
    target = "cpu"
    if torch.cuda.is_available():
        target = "cuda"

    model, preprocess = clip.load(clipmodel, device=target, jit=jit)
    print(f"Loaded CLIP model: {clipmodel} on {target} (jit={jit})")
    return model, preprocess


# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Render and export the final hard (argmax) highlighting predicted by ``mlp``.

    Writes ``{name}.ply`` (per-vertex colours) and ``final_render.jpg`` into ``log_dir``.

    Args:
        log_dir: directory receiving the exported mesh and the render.
        name: base name of the exported PLY file.
        mesh: mesh object that is coloured, rendered and exported via ``mesh.export``.
        mlp: network evaluated on ``vertices``; it is switched to eval mode and left there.
        vertices: input passed to ``mlp`` (one row per vertex).
        colors: accepted for interface compatibility only; the fixed highlight/gray
            palette defined below is always used, as in the original implementation.
        render: renderer object providing ``render_views``.
        background: background colour forwarded to ``render.render_views``.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        # Build all helper tensors on the device of the predictions rather than on the
        # notebook-level global `device`, so the function also works when they differ.
        target_device = probs.device
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # One-hot class assignment used to colour the mesh for the renders.
        one_hot = torch.zeros(probs.shape, device=target_device).scatter_(1, max_idx, 1)

        highlight = torch.tensor([204, 255, 0], device=target_device)
        gray = torch.tensor([180, 180, 180], device=target_device)
        palette = torch.stack((highlight / 255, gray / 255))
        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # Per-vertex colours (0-255 integers) for the exported mesh:
        # class 0 is the highlight colour, everything else is gray.
        final_color = torch.where(max_idx == 0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')

def clip_loss(rendered_images, text_prompt, clip_transform, clip_model, tokenizer, device, aug_transform=None, n_augs=0):
    """
    Computes the CLIP loss as negative cosine similarity between
    rendered image embeddings and the text prompt embedding.

    Both code paths (with and without augmentation) cast the image embeddings to
    float32 and L2-normalize them before comparing against the text embedding.

    Args:
        rendered_images (torch.Tensor): shape (B, 3, H, W)
        text_prompt (str): e.g., "a gray chair with highlighted seat"
        clip_transform (torchvision.transforms): preprocessing for CLIP
        clip_model (torch.nn.Module): preloaded CLIP model
        tokenizer (callable): CLIP tokenizer
        device (str): "cuda" or "cpu"
        aug_transform (torchvision.transforms): augmentation for CLIP;
            required when ``n_augs > 0``
        n_augs (int): number of augmentations to apply (0 disables augmentation)
    Returns:
        loss (torch.Tensor): scalar CLIP loss
    Raises:
        ValueError: if ``n_augs`` is negative, or if ``n_augs > 0`` while
            ``aug_transform`` is None.
    """
    if n_augs < 0:
        raise ValueError(f"n_augs must be non-negative, got {n_augs}")
    if n_augs > 0 and aug_transform is None:
        raise ValueError("aug_transform must be provided when n_augs > 0")

    # Encode text once; it is a constant target, so no gradient is needed.
    text_tokens = tokenizer([text_prompt]).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_tokens).float()
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)  # L2 norm

    def _negative_similarity(images):
        # Shared by both branches so dtype handling and normalization stay consistent.
        image_features = clip_model.encode_image(images).float()
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return -torch.mean(torch.cosine_similarity(image_features, text_features))

    if n_augs == 0:
        return _negative_similarity(clip_transform(rendered_images))

    # Average the loss over independently augmented copies of the renders.
    loss = 0.0
    for _ in range(n_augs):
        loss = loss + _negative_similarity(aug_transform(rendered_images))

    return loss / n_augs


def save_renders(dir, i, rendered_images, name=None):
    """Save ``rendered_images`` with ``torchvision.utils.save_image``.

    Args:
        dir: base output directory.
        i: iteration index, used for the default file name ``renders/iter_{i}.jpg``.
        rendered_images: image tensor passed to ``torchvision.utils.save_image``.
        name: optional file name (relative to ``dir``) overriding the default.
    """
    relative_path = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    torchvision.utils.save_image(rendered_images, os.path.join(dir, relative_path))

In [None]:
def prepare_dataset(input_path="/content/drive/MyDrive/part3-affordancenet-benchmark/data/",
                    object_class="Knife",
                    affordance="grasp",
                    val_size=5,
                    test_size=5,
                    seed=42):
    """Return the validation/test splits for one object class and affordance.

    The splits are cached as ``val_set_{class}_{affordance}.pkl`` and
    ``test_set_{class}_{affordance}.pkl`` inside ``input_path``. When both files
    exist they are loaded as-is; otherwise the full dataset is loaded from
    ``data/full-shape`` (downloading and extracting it first if missing), split,
    and the result is written to ``input_path``.

    Note: the cache key only contains the class and the affordance, so existing
    files are reused even if ``val_size``, ``test_size`` or ``seed`` differ from
    the values they were created with.

    Args:
        input_path: directory holding the cached split files (created if missing).
        object_class: object class used for the split and the file names.
        affordance: affordance used for the split and the file names.
        val_size: forwarded to ``split_dataset_by_class_and_affordance``.
        test_size: forwarded to ``split_dataset_by_class_and_affordance``.
        seed: forwarded to ``split_dataset_by_class_and_affordance``.

    Returns:
        tuple: ``(val_set, test_set)``.

    Raises:
        subprocess.CalledProcessError: if the ``gdown`` download fails.
    """
    # Imported locally so the function is self-contained and plain Python
    # (no IPython shell magics inside a function body).
    import subprocess
    import zipfile

    os.makedirs(input_path, exist_ok=True)

    # File naming based on class and affordance
    val_path = os.path.join(input_path, f"val_set_{object_class}_{affordance}.pkl")
    test_path = os.path.join(input_path, f"test_set_{object_class}_{affordance}.pkl")

    if os.path.exists(val_path) and os.path.exists(test_path):
        print("Dataset already exists in Drive. Loading...")
        # NOTE: pickle.load can execute arbitrary code; only load split files
        # that were produced by this notebook.
        with open(val_path, "rb") as f:
            val_set = pickle.load(f)
        with open(test_path, "rb") as f:
            test_set = pickle.load(f)

        print(f"Loaded from Drive: {len(val_set)} val / {len(test_set)} test samples for {object_class} - {affordance}")
        return val_set, test_set

    print("Dataset files not found in Drive. Checking local storage...")

    local_dataset = "data/full-shape/full_shape_train_data.pkl"
    if not os.path.exists(local_dataset):
        print("Local dataset not found. Downloading...")
        # check=True makes a failed download raise here instead of surfacing
        # later as a confusing "file not found" when the dataset is loaded.
        subprocess.run(
            ["gdown", "1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF", "--output", "full-shape.zip"],
            check=True,
        )
        with zipfile.ZipFile("full-shape.zip") as archive:
            archive.extractall("data/full-shape")
    else:
        print("Local dataset found. Skipping download.")

    # Load and split
    original_dataset = load_dataset(local_dataset)
    val_set, test_set = split_dataset_by_class_and_affordance(
        original_dataset, object_class, affordance, val_size=val_size, test_size=test_size, seed=seed
    )

    # Save to Drive with class and affordance in the name
    with open(val_path, "wb") as f:
        pickle.dump(val_set, f)
    with open(test_path, "wb") as f:
        pickle.dump(test_set, f)

    print(f"Created and saved: {len(val_set)} val / {len(test_set)} test samples for {object_class} - {affordance}")
    return val_set, test_set


In [None]:
# Object class and affordance evaluated by this notebook. They select the cached split
# files and are part of the Optuna database file name built further below.
semantic_class = "Table"
affordance = "support"

In [None]:
# Build (or load from the cache) the validation/test splits for the selected class and
# affordance. test_size is not passed here, so prepare_dataset's default (5) applies.
val_set, test_set = prepare_dataset(
    object_class=semantic_class,
    affordance=affordance,
    val_size=5,
    seed=42
)


In [None]:
def get_vertex_scores(pred_class: torch.Tensor, positive_class: int = 1, apply_softmax=None) -> torch.Tensor:
    """
    Returns vertex-wise confidence scores for the positive class.

    ``NeuralHighlighter`` already ends with a softmax, so its output rows are
    probabilities. Applying softmax a second time would compress the scores into
    roughly [0.27, 0.73] and make most of a 0.1-0.9 threshold grid degenerate.
    By default the function therefore detects already-normalized input
    (non-negative rows summing to 1) and only applies softmax to raw logits.

    Args:
        pred_class (torch.Tensor): shape [N, 2], class probabilities or raw logits
        positive_class (int): which class index should be interpreted as affordance (1 or 0)
        apply_softmax (bool | None): True forces a softmax, False uses the input
            as-is, None (default) auto-detects whether the input is already normalized

    Returns:
        torch.Tensor: shape [N], probabilities
    """
    if apply_softmax is None:
        row_sums = pred_class.sum(dim=1)
        looks_normalized = bool((pred_class >= 0).all()) and torch.allclose(
            row_sums, torch.ones_like(row_sums), atol=1e-3
        )
        apply_softmax = not looks_normalized

    probs = F.softmax(pred_class, dim=1) if apply_softmax else pred_class
    return probs[:, positive_class]

def compute_IoU(pred_labels: torch.Tensor, gt_labels: torch.Tensor):
    """Intersection-over-union of two label tensors interpreted as boolean masks.

    Args:
        pred_labels (torch.Tensor): predicted labels; non-zero entries count as positive
        gt_labels (torch.Tensor): ground-truth labels; non-zero entries count as positive

    Returns:
        float: IoU, or NaN when both masks are empty (the union is zero).
    """
    pred_mask, gt_mask = pred_labels.bool(), gt_labels.bool()
    overlap = torch.logical_and(pred_mask, gt_mask).sum().float()
    combined = torch.logical_or(pred_mask, gt_mask).sum().float()

    if combined > 0:
        return (overlap / combined).item()
    return float('nan')

def optimize_IoU_threshold(projected_scores, gt_labels, thresholds=None, gt_thresholds=None):
    """
    Computes IoU over different thresholds on predicted scores and GT thresholds.

    Ties are resolved in favour of the combination evaluated last (GT thresholds
    are the outer loop, prediction thresholds the inner loop). Combinations whose
    IoU is undefined (NaN) are recorded but never selected; if every combination
    is undefined the returned best IoU is NaN and the thresholds keep their
    defaults (0.5 / 0.0).

    Args:
        projected_scores (torch.Tensor): shape [N], soft prediction per point
        gt_labels (torch.Tensor): shape [N], soft or binary GT labels
        thresholds (list or tensor): list of thresholds to test (default: 0.1 to 0.9)
        gt_thresholds (list or tensor): list of GT thresholds to test (default: 0.0 to 0.5)

    Returns:
        (float, float, float, pd.DataFrame): best prediction threshold, best GT threshold, best IoU, DataFrame of all results
    """
    if thresholds is None:
        thresholds = torch.linspace(0.1, 0.9, steps=9)
    if gt_thresholds is None:
        gt_thresholds = torch.linspace(0.0, 0.5, steps=6)

    # The binarized predictions do not depend on the GT threshold: build them once.
    pred_masks = [(float(pred_t), (projected_scores > pred_t).long()) for pred_t in thresholds]

    records = []
    best_iou = -1  # internal sentinel; replaced by NaN below if nothing valid is found
    best_pred_thresh = 0.5
    best_gt_thresh = 0.0
    found_valid = False

    for gt_t in gt_thresholds:
        gt_binary = (gt_labels > gt_t).long()
        for pred_value, pred_binary in pred_masks:
            iou = compute_IoU(pred_binary, gt_binary)
            records.append({"pred_threshold": pred_value, "gt_threshold": float(gt_t), "iou": iou})
            if iou >= best_iou:  # always False for NaN, so undefined IoUs are skipped
                best_iou = iou
                best_pred_thresh = pred_value
                best_gt_thresh = float(gt_t)
                found_valid = True

    if not found_valid:
        # Do not leak the -1 sentinel as if it were a real IoU.
        best_iou = float('nan')

    return best_pred_thresh, best_gt_thresh, best_iou, pd.DataFrame(records)

In [None]:
# === Memory Cleanup Function ===
def clean_memory(*objects):
    """
    Runs garbage collection and then frees cached GPU memory.

    Note: Python cannot delete the caller's variables from here. Passing objects
    only lets this function drop its own references to them; the caller still has
    to ``del`` (or overwrite) its own names for the memory to become reclaimable.

    Args:
        *objects: objects the caller no longer needs (accepted for convenience).
    """
    # Drop the references held by this call frame.
    del objects
    # Collect first so that tensors released by the collector can be returned
    # to the driver by empty_cache() in the same call.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
def optimize_highlighting(sample, affordance, mlp, optimizer, render, clip_model, tokenizer,
                        clip_transform, augment_transform, n_views, n_augs, n_iter, colors, background,
                        output_dir, device, run_n, prompt, resolution=16, voxel_threshold=0.5, seed=42):
    """Optimize ``mlp`` so that CLIP matches renders of the highlighted mesh to ``prompt``.

    The sample's point cloud is converted to a voxel mesh (written to a temporary
    OBJ file), the mesh is coloured from the MLP output, rendered from several
    views, and the CLIP loss is minimized for ``n_iter`` steps. Renders and a log
    line are written every 100 iterations; the final mesh, render and a summary
    file are written at the end.

    Args:
        sample: dataset entry; must contain a string ``"semantic class"`` field.
        affordance: affordance name, used in the output directory name.
        mlp: network mapping vertices to per-class scores.
        optimizer: optimizer over ``mlp``'s parameters.
        render: renderer providing ``render_views``.
        clip_model, tokenizer, clip_transform, augment_transform, n_augs:
            forwarded to ``clip_loss``.
        n_views: number of views rendered per iteration.
        n_iter: number of optimization steps.
        colors: palette forwarded to ``color_mesh``.
        background: background colour forwarded to the renderer.
        output_dir: parent directory of this run's export directory.
        device: device for the point cloud and the vertices.
        run_n: run index, used in the output directory name.
        prompt: text prompt for the CLIP loss.
        resolution: forwarded to ``pointcloud_to_voxel_mesh``.
        voxel_threshold: forwarded to ``pointcloud_to_voxel_mesh`` as ``threshold``.
        seed: seed applied to torch, random and numpy before optimizing (default 42).

    Returns:
        tuple: ``(trimesh_mesh, pred_class, export_path)`` where ``pred_class`` is
        the graph-free MLP output for the mesh vertices, computed with the final
        (fully optimized) weights.

    Raises:
        ValueError: if ``sample`` has no valid ``"semantic class"`` field.
    """
    # Validate first: the field is used right below to build the export path.
    if "semantic class" not in sample or not isinstance(sample["semantic class"], str):
        raise ValueError(f"Error: Missing or invalid 'semantic class' field in sample: {sample}")

    # ==== Set Seed for Determinism ====
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    export_path = os.path.join(output_dir, f"run_{run_n}_{str(sample['semantic class']).lower()}_{affordance}")
    os.makedirs(os.path.join(export_path, "renders"), exist_ok=True)
    temp_obj_path = "data/outputDemo.obj"
    os.makedirs(os.path.dirname(temp_obj_path), exist_ok=True)

    try:
        points = get_coordinates(sample, device)
        trimesh_mesh = pointcloud_to_voxel_mesh(
            points,
            resolution=resolution,
            threshold=voxel_threshold,
            export_path=temp_obj_path
        )

        sampled_mesh = Mesh(temp_obj_path)
        MeshNormalizer(sampled_mesh)()
        vertices = sampled_mesh.vertices.clone().detach().to(device).float()

        losses = []
        pred_class = loss = rendered_images = None
        start_time = time.time()
        for i in tqdm(range(n_iter)):
            optimizer.zero_grad()
            pred_class = mlp(vertices)
            color_mesh(pred_class, sampled_mesh, colors)

            rendered_images, _, _ = render.render_views(
                sampled_mesh,
                num_views=n_views,
                show=False,
                center_azim=0,
                center_elev=0,
                std=1,
                return_views=True,
                lighting=True,
                background=background
            )

            loss = clip_loss(rendered_images, prompt, clip_transform, clip_model, tokenizer, device, augment_transform, n_augs)
            # NOTE(review): retain_graph=True is kept from the original; it looks
            # unnecessary because the graph is rebuilt every iteration — confirm
            # against color_mesh / the renderer before removing it.
            loss.backward(retain_graph=True)
            optimizer.step()

            loss_value = loss.item()
            losses.append(loss_value)

            if i % 100 == 0:
                save_renders(export_path, i, rendered_images)
                with open(os.path.join(export_path, "log.txt"), "a") as f:
                    f.write(f"Iter {i} | Loss: {loss_value:.4f} | Last 100 avg CLIP score: {np.mean(losses[-100:]):.4f}\n")

        total_time = time.time() - start_time
        minutes, seconds = divmod(total_time, 60)

        # Drop the last iteration's autograd graph before releasing cached memory.
        pred_class = loss = rendered_images = None
        gc.collect()
        torch.cuda.empty_cache()

        # Save final output
        save_final_results(export_path, sample["semantic class"], sampled_mesh, mlp, vertices, colors, render, background)

        # Predictions from the final weights, without a graph attached, so the
        # returned scores match the exported mesh and hold no extra GPU memory.
        with torch.no_grad():
            pred_class = mlp(vertices)

        # Average over the iterations that actually exist (fewer than 10 for short runs).
        recent_losses = losses[-10:]
        final_avg_loss = sum(recent_losses) / len(recent_losses) if recent_losses else float("nan")

        with open(os.path.join(export_path, "final_score.txt"), "w") as f:
            f.write(f"Prompt: {prompt}\n")
            f.write(f"Final average CLIP loss: {final_avg_loss:.4f}\n")
            f.write(f"Total time: {int(minutes)}m {int(seconds)}s\n")
            f.write(f"Resolution: {resolution}\n")
            f.write(f"Voxel threshold: {voxel_threshold}\n")
            f.write(f"Number of views: {n_views}\n")
            f.write(f"Number of augmentations: {n_augs}\n")
            f.write(f"Number of iterations: {n_iter}\n")
            f.write(f"Learning rate: {optimizer.param_groups[0]['lr']}\n")
    finally:
        # Remove the temporary OBJ even when the optimization fails midway.
        if os.path.exists(temp_obj_path):
            os.remove(temp_obj_path)

    return trimesh_mesh, pred_class, export_path

In [None]:
# Optuna objective function for affordance highlighting validation

def objective(trial):
    """Optuna objective: mean best-IoU of the highlighting over the validation samples.

    For every sample of the notebook-level ``val_set`` that contains the
    notebook-level ``affordance``, a fresh ``NeuralHighlighter`` is optimized with
    the hyperparameters sampled from ``trial``; the resulting vertex scores are
    projected back to the point cloud and the best IoU over a grid of prediction /
    GT thresholds is recorded. The per-sample IoU is reported to Optuna after each
    sample so the trial can be pruned.

    Args:
        trial: the Optuna trial providing the hyperparameter suggestions.

    Returns:
        float: NaN-ignoring mean of the per-sample best IoUs, or NaN when no
        validation sample contains the affordance.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner decides to stop the trial.
    """
    # === Sample hyperparameters ===
    clip_model_name = trial.suggest_categorical("clip_model", ["ViT-B/32", "ViT-B/16", "ViT-L/14"])
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    network_depth = trial.suggest_int("depth", 3, 8)
    n_augs = trial.suggest_int("n_augs", 0, 5)
    n_views = trial.suggest_int("n_views", 2, 8)
    n_iters = trial.suggest_int("n_iter", 1600, 2800, step=100)
    resolution = trial.suggest_int("resolution", 8, 24, step=8)
    threshold = trial.suggest_float("voxel_thresh", 0.1, 0.9)
    # The template strings are stored by Optuna as categorical choices; keep them unchanged.
    reversed_template = "An highlighted {} region with gray {}"
    prompt_template = trial.suggest_categorical("prompt", [
        "A 3D render of a gray {} with highlighted {}",
        "A gray {} with highlighted {}",
        "A gray {} with highlighted {} region",
        reversed_template
    ])

    # ==== Renderer (created once per trial) ====
    render_res = 224
    render = Renderer(dim=(render_res, render_res))

    # ==== Normalization and Augmentation ====
    clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

    clip_transform = transforms.Compose([
        transforms.Resize((render_res, render_res)),
        clip_normalizer
    ])

    augment_transform = transforms.Compose([
        transforms.RandomResizedCrop(render_res, scale=(1, 1)),
        transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
        clip_normalizer
    ])

    # ==== Colors and Other Constants ====
    colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)
    background = torch.tensor((1., 1., 1.)).to(device)

    # === Setup output directory ===
    run_name = f"trial_{trial.number}_{clip_model_name.replace('/', '-')}_res{resolution}_th{threshold:.2f}_lr{learning_rate:.0e}_depth{network_depth}_views{n_views}_augs{n_augs}_iters{n_iters}"
    export_path = f"/content/drive/MyDrive/part3-affordancenet-benchmark/outputs/{run_name}"

    # === Load models and transforms ===
    clip_model, _ = get_clip_model(clip_model_name)
    tokenizer = clip.tokenize

    total_ious = []
    results = []

    cont = 0
    for sample in val_set:
        if not is_affordance_present(sample, affordance):
            continue

        class_name = str(sample['semantic class']).lower()
        if prompt_template == reversed_template:
            # This template names the affordance first and the object second.
            prompt = prompt_template.format(affordance.lower(), class_name)
        else:
            prompt = prompt_template.format(class_name, affordance.lower())

        # A fresh network and optimizer for every sample.
        mlp = NeuralHighlighter(num_layers=network_depth).to(device)
        optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate)

        mesh, pred_class, render_dir = optimize_highlighting(
            sample=sample,
            affordance=affordance,
            mlp=mlp,
            optimizer=optimizer,
            render=render,
            clip_model=clip_model,
            tokenizer=tokenizer,
            clip_transform=clip_transform,
            augment_transform=augment_transform,
            n_views=n_views,
            n_augs=n_augs,
            n_iter=n_iters,
            colors=colors,
            background=background,
            run_n=cont,
            output_dir=export_path,
            device=device,
            prompt=prompt,
            resolution=resolution,
            voxel_threshold=threshold,
        )

        pointcloud = get_coordinates(sample, device)
        gt_labels = get_affordance_label(sample, affordance, device)
        # Class index 0 is the highlight colour in `colors`, i.e. the affordance region.
        vertex_scores = get_vertex_scores(pred_class, positive_class=0)
        projected_scores = project_vertex_scores_to_pointcloud(mesh, vertex_scores, pointcloud)

        best_pred_thresh, best_gt_thresh, best_iou, iou_df = optimize_IoU_threshold(
            projected_scores,
            gt_labels,
            thresholds=torch.linspace(0.1, 0.9, steps=9),
            gt_thresholds=torch.linspace(0.0, 0.45, steps=9)
        )

        # Report the per-sample IoU so the pruner can stop unpromising trials early.
        trial.report(best_iou, step=cont)
        if trial.should_prune():
            clean_memory(mlp, optimizer, render, clip_model, tokenizer, pointcloud, gt_labels, vertex_scores)
            raise optuna.exceptions.TrialPruned()

        total_ious.append(best_iou)

        results.append({
            **trial.params,
            "sample_class": sample['semantic class'],
            "affordance": affordance,
            "best_iou": best_iou,
            "best_pred_threshold": best_pred_thresh,
            "best_gt_threshold": best_gt_thresh
        })

        with open(os.path.join(render_dir, f"projected_scores_{sample['semantic class']}.pkl"), "wb") as f:
            pickle.dump(projected_scores.detach().cpu().numpy(), f)
        iou_df.to_csv(os.path.join(render_dir, f"iou_by_threshold_{sample['semantic class']}.csv"), index=False)

        cont += 1

    # The trial directory is otherwise only created by optimize_highlighting, so make
    # sure it exists even when no sample was processed.
    os.makedirs(export_path, exist_ok=True)
    pd.DataFrame(results).to_csv(os.path.join(export_path, "metrics.csv"), index=False)

    if not total_ious:
        print(f"Trial {trial.number} finished without any sample containing '{affordance}'")
        return float("nan")

    mean_iou = np.nanmean(total_ious)
    print(f"Trial {trial.number} finished with mean IoU: {mean_iou:.4f}")
    return mean_iou

In [None]:
export_path = "/content/drive/MyDrive/part3-affordancenet-benchmark/"
db_folder = os.path.join(export_path, "db")
os.makedirs(db_folder, exist_ok=True)

# SQLite database file stored in the Drive folder above, one file per class/affordance
# pair, so an interrupted study can be resumed (load_if_exists=True below).
db_path = os.path.join(db_folder, "optuna_{}_{}.db".format(semantic_class.lower(), affordance))
storage_url = f"sqlite:///{db_path}"

# Alternative: use a remote PostgreSQL database for Optuna storage
# env_path = "/content/drive/MyDrive/.env"
# load_dotenv(dotenv_path=env_path)
# storage_url = os.getenv("STORAGE_URL")

study = optuna.create_study(
    direction="maximize",
    study_name="affordance_highlighting_part3",
    storage=storage_url,
    load_if_exists=True
)
study.optimize(objective, n_trials=20)

os.makedirs(os.path.join(export_path, "outputs"), exist_ok=True)

# study.best_params / study.best_trial raise when no trial has completed
# (e.g. every trial was pruned or failed), so check before using them.
completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
if completed_trials:
    pd.DataFrame([study.best_params]).to_csv(
        os.path.join(export_path, "outputs", "best_hyperparameters.csv"), index=False
    )

    # Print the best trial
    print("Best trial:")
    print(study.best_trial)
else:
    print("No trial completed successfully; best hyperparameters were not saved.")

In [None]:
# Release PyTorch's cached GPU memory and run the garbage collector once the study is done.
torch.cuda.empty_cache()
gc.collect()