<a href="https://colab.research.google.com/github/francescasaglimbeni/Project-Affordance-Highlighting/blob/main/Project_Affordance_Highlighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PROJECT**

In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
!pip install transforms3d
!pip install gdown
!pip install open3d
!pip install torch
!pip install numpy
!pip install tqdm
!pip install clip
!pip install pytorch3d

In [None]:
import os
import pathlib
import copy
import json
import random
import numpy as np
import torch
import torch.nn as nn
import clip
import kaolin as kal
import kaolin.ops.mesh
import torchvision
import open3d as o3d
import gdown
from torch.utils.data import Dataset
from tqdm import tqdm
from torchvision import transforms
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import List, Tuple, Optional, Union
from pathlib import Path
from os.path import join as opj
import h5py
import pickle as pkl
import shutil
from scipy.spatial import KDTree

In [None]:
!rm -rf /content/output_PART3
!rm -rf /content/Affordance_Highlighting_Project_2024/output

## **Project folder setup + network definition + CLIP loss**

In [None]:
repo_path = pathlib.Path('./Affordance_Highlighting_Project_2024')
if not repo_path.is_dir():
    os.system('git clone https://github.com/paolotron/Affordance_Highlighting_Project_2024.git')

%cd ./Affordance_Highlighting_Project_2024/
!ls

In [None]:
torch.cuda.empty_cache()

from collections import defaultdict
from itertools import permutations, product
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh


class FourierFeatureTransform(nn.Module):
    """Learned sinusoidal feature mapping.

    The input is projected to ``width`` features by a trainable linear layer,
    multiplied by ``sigma`` and passed through ``sin``. The output therefore
    has exactly ``width`` features per input row.
    """

    def __init__(self, input_dim, width, sigma):
        super().__init__()
        self.sigma = sigma
        self.linear = nn.Linear(input_dim, width)

    def forward(self, x):
        """Return ``sin(sigma * linear(x))``."""
        return torch.sin(self.linear(x) * self.sigma)


class NeuralHighlighter(nn.Module):
    """MLP that maps each input row to a probability distribution over classes.

    The network is a stack of ``Linear -> ReLU -> LayerNorm`` blocks followed
    by a final ``Linear`` and a ``Softmax`` over dimension 1.

    Args:
        out_dim: number of output classes (columns of the softmax output).
        depth: number of hidden ``Linear -> ReLU -> LayerNorm`` blocks that
            follow the input block.
        width: number of features of every hidden layer.
        input_dim: number of features of each input row.
        positional_encoding: when True, the input first goes through a
            ``FourierFeatureTransform``.
        sigma: frequency scale forwarded to ``FourierFeatureTransform``.
    """

    def __init__(self,  out_dim = 2, depth = 4, width=256, input_dim=3, positional_encoding=False, sigma=5.0):
        super().__init__()

        if positional_encoding:
            # FourierFeatureTransform emits exactly `width` features, so the
            # layer consuming it must accept `width` inputs (it previously
            # expected `width * 2 + input_dim`, which failed at forward time).
            layers = [
                FourierFeatureTransform(input_dim, width, sigma),
                nn.Linear(width, width),
            ]
        else:
            layers = [nn.Linear(input_dim, width)]
        layers += [nn.ReLU(), nn.LayerNorm([width])]

        for _ in range(depth):
            layers += [nn.Linear(width, width), nn.ReLU(), nn.LayerNorm([width])]

        layers += [nn.Linear(width, out_dim), nn.Softmax(dim=1)]

        self.mlp = nn.ModuleList(layers)
        print(self.mlp)

    def forward(self, x):
        """Apply every layer in order and return the per-row class probabilities."""
        for layer in self.mlp:
            x = layer(x)
        return x


device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()

def get_clip_model(clip_model):
    """Load a CLIP model by name through ``clip.load``.

    Args:
        clip_model: model name passed to ``clip.load`` (e.g. ``'ViT-B/32'``).

    Returns:
        The ``(model, preprocess)`` pair returned by ``clip.load``; the model
        is placed on the module-level ``device``.
    """
    model, preprocess = clip.load(clip_model, device=device)
    return model, preprocess

In [None]:
def _clip_similarity(encoded_renders, encoded_text, clipavg):
    """Return the cosine similarity between render and text embeddings.

    With ``clipavg == "view"`` the render embeddings are averaged over the
    views first (and the text embeddings too when there are several prompts);
    otherwise the per-view similarities are averaged.
    """
    if clipavg == "view":
        mean_render = torch.mean(encoded_renders, dim=0)
        if encoded_text.shape[0] > 1:
            return torch.cosine_similarity(mean_render,
                                           torch.mean(encoded_text, dim=0), dim=0)
        return torch.cosine_similarity(mean_render.unsqueeze(0), encoded_text)
    return torch.mean(torch.cosine_similarity(encoded_renders, encoded_text))


def clip_loss(rendered_images, clip_model, encoded_text, clip_transform, augment_transform,  n_augs, clipavg):
    """Negative CLIP similarity between rendered views and the text prompt.

    Args:
        rendered_images: batch of rendered views fed to the transforms.
        clip_model: object exposing ``encode_image``.
        encoded_text: text embeddings, one row per prompt.
        clip_transform: transform used when no augmentation is requested.
        augment_transform: random transform applied once per augmentation.
        n_augs: number of augmented passes; ``0`` uses ``clip_transform`` once.
        clipavg: ``"view"`` to average embeddings over views before comparing,
            anything else to average the per-view similarities.

    Returns:
        A tensor to be minimised: minus the similarity for ``n_augs == 0``,
        minus the sum of the similarities of all augmented passes otherwise.

    Raises:
        ValueError: if ``n_augs`` is negative.
    """
    if n_augs < 0:
        # Previously this fell through both branches and failed with an
        # unbound `loss`.
        raise ValueError(f"n_augs must be >= 0, got {n_augs}")

    if n_augs == 0:
        clip_image = clip_transform(rendered_images)
        encoded_renders = clip_model.encode_image(clip_image)
        encoded_renders = encoded_renders / encoded_renders.norm(dim=1, keepdim=True)
        # Negated so that both branches agree: lower loss == higher similarity.
        return -_clip_similarity(encoded_renders, encoded_text, clipavg)

    loss = 0.0
    for _ in range(n_augs):
        augmented_image = augment_transform(rendered_images)
        encoded_renders = clip_model.encode_image(augmented_image)
        loss = loss - _clip_similarity(encoded_renders, encoded_text, clipavg)
    return loss

In [None]:
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the final hard-labelled mesh and a render of it.

    The network is evaluated on ``vertices``, every vertex is assigned to its
    most probable class, the mesh is coloured accordingly, rendered from five
    views and written to ``log_dir`` as ``<name>.ply`` and
    ``final_render.jpg``.

    Args:
        log_dir: directory receiving the outputs.
        name: base name of the exported ``.ply`` file.
        mesh: mesh object; it is recoloured in place through ``color_mesh``.
        mlp: trained network; switched to eval mode here.
        vertices: input passed to ``mlp``.
        colors: kept for interface compatibility; the palette used for the
            final render is rebuilt inside this function.
        render: renderer exposing ``render_views``.
        background: background colour forwarded to the renderer.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # One-hot class assignment used to colour the mesh for the renders.
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        palette = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # Per-vertex colour for the exported mesh: class 0 is the highlight.
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')

In [None]:
def save_renders(dir, i, rendered_images, name=None):
    """Write ``rendered_images`` to disk with ``torchvision.utils.save_image``.

    When ``name`` is given the image is saved as ``dir/name``; otherwise it is
    saved as ``dir/renders/iter_<i>.jpg``.
    """
    relative_path = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    torchvision.utils.save_image(rendered_images, os.path.join(dir, relative_path))

## **PART 1 - PIPELINE**

In [None]:
# Seed every random source used below so that runs are repeatable.
seed = 1
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# Hyper-parameters and paths for this run.
render_res = 224
learning_rate = 0.0001
n_iter = 2500
res = 224
obj_path = '/content/Affordance_Highlighting_Project_2024/data/dog.obj'
n_augs = 5
output_dir = './output/'
clip_model = 'ViT-B/32'
clipavg = 'view'

Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
MeshNormalizer(mesh)()

# Channel-wise normalisation applied to the renders before CLIP encoding.
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
clip_transform = transforms.Compose([
    transforms.Resize((res, res)),
    clip_normalizer
])
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(1, 1)),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    clip_normalizer
])

background = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir

mlp = NeuralHighlighter().to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# Two-colour palette: index 0 is the highlight colour, index 1 is gray.
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)

# From here on `clip_model` is the loaded model, no longer the model name.
clip_model, preprocess = get_clip_model(clip_model)

# PROMPT:
prompt = "A 3D render of a gray dog with highlighted hat"
with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = clip_model.encode_text(prompt_token)
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

name_p = 'dog'
vertices = copy.deepcopy(mesh.vertices)

n_views = 5

losses = []

# OPTIMIZATION LOOP:
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # Per-vertex class probabilities, used to colour the mesh before rendering.
    pred_class = mlp(vertices)

    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                      show=False,
                                                      center_azim=0,
                                                      center_elev=0,
                                                      std=1,
                                                      return_views=True,
                                                      lighting=True,
                                                      background=background)

    # Calculate CLIP Loss
    loss = clip_loss(rendered_images, clip_model, encoded_text, clip_transform, augment_transform, n_augs, clipavg)
    # NOTE(review): retain_graph=True looks unnecessary because the graph is
    # rebuilt from mlp(vertices) on every iteration - confirm against the
    # Renderer before removing it.
    loss.backward(retain_graph=True)

    optim.step()

    losses.append(loss.item())

    # Periodic report: console, intermediate render and text log.
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

save_final_results(log_dir, name_p, mesh, mlp, vertices, colors, render, background)

# Record the prompt that produced these results (the file used to be left empty).
with open(os.path.join(log_dir, 'prompt.txt'), "w") as f:
    f.write(prompt)

## **PART 2 - PIPELINE**

### **POINTCLOUD TO MESH**

In [None]:
# Sample a point cloud from the horse mesh, save it as a .ply file and
# compute a per-point local density estimate.
output_directory = './pointcloud'
os.makedirs(output_directory, exist_ok=True)

mesh = o3d.io.read_triangle_mesh("/content/Affordance_Highlighting_Project_2024/data/horse.obj")

point_cloud = mesh.sample_points_uniformly(number_of_points=50000)

# remove_statistical_outlier returns (filtered_cloud, kept_indices) and does
# not modify the cloud it is called on, so the result must be kept.
point_cloud, _ = point_cloud.remove_statistical_outlier(nb_neighbors=25, std_ratio=2.5)

point_cloud.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=40))

o3d.visualization.draw_geometries([point_cloud])

ply_output_path = os.path.join(output_directory, "horse.ply")
o3d.io.write_point_cloud(ply_output_path, point_cloud)

points = np.asarray(point_cloud.points)

# Local density = number of points within `radius` of each point, computed
# with a single vectorised query instead of one query per point.
kdtree = KDTree(points)
radius = 0.02
densities = np.asarray(kdtree.query_ball_point(points, radius, return_length=True))

# Min-max normalisation; a constant density map is mapped to zeros instead of
# dividing by zero.
density_span = densities.max() - densities.min()
if density_span > 0:
    densities_normalized = (densities - densities.min()) / density_span
else:
    densities_normalized = np.zeros(len(densities))

In [None]:
def point_cloud_to_mesh(ply_path, obj_output_path, depth=9):
    """Reconstruct a triangle mesh from a point-cloud file and save it.

    The cloud at ``ply_path`` is meshed with Open3D's Poisson reconstruction
    at the given ``depth``; vertices whose density falls below the 0.001
    quantile are removed, vertex normals are computed, five iterations of
    simple smoothing are applied and the result is written to
    ``obj_output_path``.
    """
    cloud = o3d.io.read_point_cloud(ply_path)
    print(f"Loaded point cloud with {len(cloud.points)} points.")

    poisson_mesh, vertex_density = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(cloud, depth=depth)
    vertex_density = np.asarray(vertex_density)

    # Discard the vertices with the lowest reconstruction density.
    density_cutoff = np.quantile(vertex_density, 0.001)
    poisson_mesh.remove_vertices_by_mask(vertex_density < density_cutoff)

    poisson_mesh.compute_vertex_normals()
    smoothed_mesh = poisson_mesh.filter_smooth_simple(number_of_iterations=5)

    o3d.io.write_triangle_mesh(obj_output_path, smoothed_mesh)
    print(f"Mesh saved to {obj_output_path}")


point_cloud_path = "/content/Affordance_Highlighting_Project_2024/pointcloud/horse.ply"

# Output location of the reconstructed mesh; the Part 2 pipeline loads this file.
mesh_output_path = "/content/Affordance_Highlighting_Project_2024/data/point_cloud.obj"

point_cloud_to_mesh(point_cloud_path, mesh_output_path, depth = 9)

### **PIPELINE**

In [None]:
# Seed every random source used below so that runs are repeatable.
seed = 1
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Hyper-parameters and paths for this run; the mesh is the one reconstructed
# from the point cloud.
# NOTE(review): output_dir is the same './output/' used by the Part 1
# pipeline, so its files are overwritten/appended to - confirm this is wanted.
render_res = 224
learning_rate = 0.0001
n_iter = 2500
res = 224
obj_path = '/content/Affordance_Highlighting_Project_2024/data/point_cloud.obj'
n_augs = 5
output_dir = './output/'
clip_model = 'ViT-B/32'
clipavg = 'view'

Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
MeshNormalizer(mesh)()

# Channel-wise normalisation applied to the renders before CLIP encoding.
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
clip_transform = transforms.Compose([
    transforms.Resize((res, res)),
    clip_normalizer
])
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(1, 1)),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    clip_normalizer
])

background = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir

mlp = NeuralHighlighter().to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# Two-colour palette: index 0 is the highlight colour, index 1 is gray.
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)

# From here on `clip_model` is the loaded model, no longer the model name.
clip_model, preprocess = get_clip_model(clip_model)

# PROMPT:
prompt = "A 3D render of a grey horse with highlighted necklace"
with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = clip_model.encode_text(prompt_token)
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

name_p = 'horse'
vertices = copy.deepcopy(mesh.vertices)

n_views = 5

losses = []

# OPTIMIZATION LOOP:
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # Per-vertex class probabilities, used to colour the mesh before rendering.
    pred_class = mlp(vertices)

    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                      show=False,
                                                      center_azim=0,
                                                      center_elev=0,
                                                      std=1,
                                                      return_views=True,
                                                      lighting=True,
                                                      background=background)

    # Calculate CLIP Loss
    loss = clip_loss(rendered_images, clip_model, encoded_text, clip_transform, augment_transform, n_augs, clipavg)
    # NOTE(review): retain_graph=True looks unnecessary because the graph is
    # rebuilt from mlp(vertices) on every iteration - confirm against the
    # Renderer before removing it.
    loss.backward(retain_graph=True)

    optim.step()

    losses.append(loss.item())

    # Periodic report: console, intermediate render and text log.
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)

        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

save_final_results(log_dir, name_p, mesh, mlp, vertices, colors, render, background)

# Record the prompt that produced these results (the file used to be left empty).
with open(os.path.join(log_dir, 'prompt.txt'), "w") as f:
    f.write(prompt)

## **PART 3 - PIPELINE**

### **AFFORDANCENET + DATASET**

In [None]:
repo_path = pathlib.Path('AffordanceNet')
if not repo_path.is_dir():
    os.system('git clone https://github.com/Gorilla-Lab-SCUT/AffordanceNet.git')

In [None]:
file_id = '1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF'

url = f'https://drive.google.com/uc?export=download&id={file_id}'

gdown.download(url, 'dataset.zip', quiet=False)
!unzip dataset.zip -d /content/dataset

`provider.py` has to be moved from `AffordanceNet/utils/` into the project root so that it can be imported directly with `from provider import ...`:

In [None]:
# Location of provider.py inside the AffordanceNet clone.
source = "/content/Affordance_Highlighting_Project_2024/AffordanceNet/utils/provider.py"

# Project root: provider.py is moved here so `from provider import ...` resolves.
destination = "/content/Affordance_Highlighting_Project_2024"

os.makedirs(destination, exist_ok=True)

try:
    shutil.move(source, destination)
    print(f"File moved in: {destination}")
except FileNotFoundError:
    # Reported instead of raised, e.g. when a previous run already moved the file.
    print("Error file doesn't exist")
except Exception as e:
    # Any other failure is only printed so the notebook keeps running.
    print(f"Error: {e}")

In [None]:
%cd Affordance_Highlighting_Project_2024
from provider import rotate_point_cloud_SO3, rotate_point_cloud_y

def pc_normalize(pc):
    """Centre a point cloud on its centroid and scale it to unit radius.

    Args:
        pc: array with one point per row.

    Returns:
        ``(pc, centroid, m)`` where ``pc`` is the centred cloud divided by
        ``m``, ``centroid`` is the per-column mean of the input and ``m`` is
        the largest distance of a centred point from the origin. When all
        points coincide (``m == 0``) the centred cloud is returned unscaled
        instead of dividing by zero.
    """
    centroid = np.mean(pc, axis=0)
    pc = pc - centroid
    m = np.max(np.sqrt(np.sum(pc**2, axis=1)))
    # A degenerate cloud has zero radius; dividing would fill it with NaN.
    if m > 0:
        pc = pc / m
    return pc, centroid, m


def semi_points_transform(points):
    """Return ``points`` perturbed by small Gaussian noise.

    The noise of each column is scaled by ``2e-3`` times the extent
    (max - min) of that column.
    """
    extent = points.max(axis=0) - points.min(axis=0)
    scale = 2e-3 * extent[np.newaxis, :]
    n_rows, n_cols = points.shape[0], points.shape[1]
    noise = np.random.randn(n_rows, n_cols)
    return points + scale * noise


class AffordNetDataset(Dataset):
    """Dataset over pickled shape records with per-point affordance labels.

    Args:
        data_dir: directory holding the ``*.pkl`` files.
        split: split name interpolated into the pickle file names.
        partial: load ``partial_<split>_data.pkl`` and emit one sample per view.
        rotate: ``'None'`` for no rotation; for the ``train`` split ``'so3'``
            or ``'z'`` apply a rotation from ``provider`` at access time; for
            other splits it is the key of the stored rotation matrices.
        semi: load ``semi_label_1.pkl`` instead of the split files.
    """

    def __init__(self, data_dir, split, partial=False, rotate='None', semi=False):
        super().__init__()
        self.data_dir = data_dir
        self.split = split
        self.partial = partial
        self.rotate = rotate
        self.semi = semi

        self.load_data()

        # Affordance names are taken from the first loaded record.
        self.affordance = self.all_data[0]["affordance"]

    def _read_pickle(self, filename):
        """Unpickle ``filename`` from ``data_dir``."""
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(opj(self.data_dir, filename), 'rb') as f:
            return pkl.load(f)

    @staticmethod
    def _base_record(info):
        """Copy the fields shared by every sample built from ``info``."""
        return {
            "shape_id": info["shape_id"],
            "semantic class": info["semantic class"],
            "affordance": info["affordance"],
        }

    def load_data(self):
        """Fill ``self.all_data`` with one dict per sample."""
        self.all_data = []
        use_stored_rotations = self.split != 'train' and self.rotate != 'None'

        if self.semi:
            temp_data = self._read_pickle('semi_label_1.pkl')
        elif self.partial:
            temp_data = self._read_pickle('partial_%s_data.pkl' % self.split)
        else:
            if use_stored_rotations:
                temp_data_rotate = self._read_pickle('rotate_%s_data.pkl' % self.split)
            temp_data = self._read_pickle('full_shape_%s_data.pkl' % self.split)

        for index, info in enumerate(temp_data):
            if self.partial:
                # One sample per partial view of the shape.
                for view, data_info in info["partial"].items():
                    record = self._base_record(info)
                    record["view_id"] = view
                    record["data_info"] = data_info
                    self.all_data.append(record)
            elif use_stored_rotations:
                # One sample per stored rotation matrix of the full shape.
                rotate_info = temp_data_rotate[index]["rotate"][self.rotate]
                full_shape_info = info["full_shape"]
                for r_data in rotate_info.values():
                    record = self._base_record(info)
                    record["data_info"] = full_shape_info
                    record["rotate_matrix"] = r_data.astype(np.float32)
                    self.all_data.append(record)
            else:
                record = self._base_record(info)
                record["data_info"] = info["full_shape"]
                self.all_data.append(record)

    def __getitem__(self, index):
        """Return ``(points, points, targets, shape_id, semantic_class)``.

        ``points`` are the first three coordinate columns after optional
        rotation and ``pc_normalize``; ``targets`` holds one column per
        affordance in ``self.affordance``.
        """
        data_dict = self.all_data[index]
        modelid = data_dict["shape_id"]
        modelcat = data_dict["semantic class"]

        data_info = data_dict["data_info"]
        coordinates = data_info["coordinate"].astype(np.float32)
        labels = data_info["label"]
        label_columns = [labels[aff].astype(np.float32).reshape(-1, 1)
                         for aff in self.affordance]
        model_data = np.concatenate([coordinates] + label_columns, axis=1)

        datas = model_data[:, :3]
        targets = model_data[:, 3:]

        if self.rotate != 'None':
            if self.split != 'train':
                datas = (np.matmul(data_dict["rotate_matrix"], datas.T)).T
            elif self.rotate == 'so3':
                datas = rotate_point_cloud_SO3(
                    datas[np.newaxis, :, :]).squeeze()
            elif self.rotate == 'z':
                # NOTE(review): the 'z' option calls rotate_point_cloud_y -
                # confirm this is the intended axis.
                datas = rotate_point_cloud_y(
                    datas[np.newaxis, :, :]).squeeze()

        datas, _, _ = pc_normalize(datas)

        return datas, datas, targets, modelid, modelcat

    def __len__(self):
        """Number of samples in ``self.all_data``."""
        return len(self.all_data)

### **PIPELINE**

In [None]:
def optimize_highlighted_regions(
    obj_path,
    prompt,
    output_dir='./output/',
    clip_model_name='ViT-B/32',
    render_res=224,
    res=224,
    learning_rate=0.0001,
    n_iter=2500,
    n_views=5,
    n_augs=5,
    device='cuda',
    mesh_id=None,
    is_validation=False
):
    """Train a ``NeuralHighlighter`` on one mesh against a text prompt.

    At every iteration the mesh is coloured with the network's per-vertex
    class probabilities, rendered from ``n_views`` views and scored with
    ``clip_loss`` against the encoded ``prompt``.

    Args:
        obj_path: path of the mesh file to load.
        prompt: text prompt encoded with CLIP.
        output_dir: directory for renders and logs (``renders`` is created).
        clip_model_name: model name passed to ``get_clip_model``.
        render_res: side of the square rendered images.
        res: side used by the CLIP resize/crop transforms.
        learning_rate: Adam learning rate.
        n_iter: number of optimisation steps.
        n_views: number of views rendered per step.
        n_augs: number of augmentations per step, forwarded to ``clip_loss``.
        device: device used for the tensors and the network created here.
            NOTE(review): ``get_clip_model`` uses the module-level ``device``,
            not this argument - confirm both always agree.
        mesh_id: optional sub-directory name used by the validation outputs.
        is_validation: when True, intermediate renders, a text log and the
            final results are written to disk.

    Returns:
        The per-vertex class probabilities of the trained network, computed
        without gradient tracking after the last optimisation step (so the
        result is also defined when ``n_iter`` is 0).
    """
    # Re-seed every random source so each object is optimised reproducibly.
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.makedirs(os.path.join(output_dir, 'renders'), exist_ok=True)

    objbase, _ = os.path.splitext(os.path.basename(obj_path))

    render = Renderer(dim=(render_res, render_res))
    mesh = Mesh(obj_path)
    MeshNormalizer(mesh)()

    clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    clip_transform = transforms.Compose([
        transforms.Resize((res, res)),
        clip_normalizer
    ])
    augment_transform = transforms.Compose([
        transforms.RandomResizedCrop(res, scale=(1, 1)),
        transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
        clip_normalizer
    ])

    clip_model, _ = get_clip_model(clip_model_name)
    with torch.no_grad():
        prompt_token = clip.tokenize([prompt]).to(device)
        encoded_text = clip_model.encode_text(prompt_token)
        encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

    mlp = NeuralHighlighter().to(device)
    optim = torch.optim.Adam(mlp.parameters(), learning_rate)

    # Two-colour palette: index 0 is the highlight colour, index 1 is gray.
    colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)
    background = torch.tensor((1., 1., 1.)).to(device)

    vertices = copy.deepcopy(mesh.vertices)
    losses = []

    # OPTIMIZATION LOOP:
    for i in tqdm(range(n_iter)):
        optim.zero_grad()

        pred_class = mlp(vertices)

        color_mesh(pred_class, mesh, colors)
        rendered_images, _, _ = render.render_views(mesh, num_views=n_views,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        loss = clip_loss(rendered_images, clip_model, encoded_text, clip_transform, augment_transform, n_augs, clipavg='view')
        # NOTE(review): retain_graph=True looks unnecessary because the graph
        # is rebuilt on every iteration - confirm before removing it.
        loss.backward(retain_graph=True)

        optim.step()

        losses.append(loss.item())

        if is_validation and i % 100 == 0:
            save_renders_PART3(output_dir, i, rendered_images, mesh_id=mesh_id)
            with open(os.path.join(output_dir, "training_info.txt"), "a") as f:
                f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

    if is_validation:
        save_final_results_PART3(output_dir, objbase, mesh, mlp, vertices, colors, render, background, mesh_id = mesh_id)

    # Evaluate the trained network once more: the in-loop prediction predates
    # the last optimiser step and still carries the autograd graph.
    with torch.no_grad():
        pred_class = mlp(vertices)

    return pred_class

In [None]:
def point_cloud_to_mesh_part3(points, output_path):
    """Build a mesh from raw points with an alpha shape and save it.

    The points are filtered with statistical outlier removal, voxel
    down-sampled, given estimated normals, meshed with an alpha shape
    (alpha 0.05), smoothed and written to ``output_path``.

    Returns:
        ``(True, mesh)`` on success; ``(False, None)`` when any step raises
        (the error message is printed).
    """
    try:
        cloud = o3d.geometry.PointCloud()
        cloud.points = o3d.utility.Vector3dVector(points)

        # Keep only the inliers, then thin the cloud on a voxel grid.
        _, inlier_indices = cloud.remove_statistical_outlier(nb_neighbors=25, std_ratio=2.5)
        cloud = cloud.select_by_index(inlier_indices)
        cloud = cloud.voxel_down_sample(voxel_size=0.003)

        normal_search = o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=40)
        cloud.estimate_normals(search_param=normal_search)

        alpha = 0.05
        surface = o3d.geometry.TriangleMesh.create_from_point_cloud_alpha_shape(cloud, alpha)

        surface.compute_vertex_normals()
        surface = surface.filter_smooth_simple(number_of_iterations=5)

        o3d.io.write_triangle_mesh(output_path, surface)
    except Exception as e:
        print(f"Mesh creation error: {str(e)}")
        return False, None
    return True, surface

In [None]:
def calculate_miou(predictions, targets, model_cat, affordance):
    """Intersection-over-union between thresholded predictions and targets.

    Both inputs may be NumPy arrays or tensors. For inputs with more than one
    dimension only column 0 is used. When the lengths differ, the predictions
    are linearly interpolated along their index to the target length. Values
    above 0.5 count as positive.

    NOTE(review): interpolating along the index assumes predictions and
    targets are ordered consistently - confirm against the caller.

    Returns:
        The IoU as a float, ``0.0`` when the union is empty, and ``0.0`` when
        any error occurs (the error message is printed).
    """
    def _to_cpu_float(values):
        if isinstance(values, np.ndarray):
            values = torch.from_numpy(values)
        return values.cpu().float()

    try:
        predictions = _to_cpu_float(predictions)
        targets = _to_cpu_float(targets)

        if predictions.dim() > 1:
            predictions = predictions[:, 0]
        if targets.dim() > 1:
            targets = targets[:, 0]

        n_targets = targets.shape[0]
        if predictions.shape[0] != n_targets:
            predictions = torch.nn.functional.interpolate(
                predictions[None, None],
                size=n_targets,
                mode='linear'
            ).squeeze()

        pred_mask = predictions > 0.5
        target_mask = targets > 0.5

        intersection = (pred_mask & target_mask).sum().item()
        union = (pred_mask | target_mask).sum().item()

        if union > 0:
            iou = intersection / union
        else:
            iou = 0.0

        print(f"Calculated mIoU for {model_cat} ({affordance}): {iou:.4f}")
        return iou
    except Exception as e:
        print(f"Error calculating mIoU: {str(e)}")
        return 0.0

In [None]:
def save_renders_PART3(dir, i, rendered_images, name=None, mesh_id=None):
    """Save ``rendered_images`` into a ``renders`` folder, creating it if needed.

    The folder is ``dir/<mesh_id>/renders`` when ``mesh_id`` is truthy and
    ``dir/renders`` otherwise. The file is called ``name`` when given, else
    ``iter_<i>.jpg``.
    """
    path_parts = [dir, mesh_id, 'renders'] if mesh_id else [dir, 'renders']
    specific_dir = os.path.join(*path_parts)
    os.makedirs(specific_dir, exist_ok=True)

    filename = name if name is not None else f'iter_{i}.jpg'
    torchvision.utils.save_image(rendered_images, os.path.join(specific_dir, filename))

def save_final_results_PART3(log_dir, name, mesh, mlp, vertices, colors, render, background, mesh_id=None):
    """Export the final hard-labelled mesh and a render of it.

    Outputs go to ``log_dir`` (or ``log_dir/<mesh_id>`` when ``mesh_id`` is
    truthy), which is created if missing: ``final_render.jpg`` with five
    rendered views and ``<name>.ply`` with per-vertex colours.

    Args:
        log_dir: base output directory.
        name: base name of the exported ``.ply`` file.
        mesh: mesh object; it is recoloured in place through ``color_mesh``.
        mlp: trained network; switched to eval mode here.
        vertices: input passed to ``mlp``.
        colors: kept for interface compatibility; the palette used for the
            final render is rebuilt inside this function.
        render: renderer exposing ``render_views``.
        background: background colour forwarded to the renderer.
        mesh_id: optional sub-directory name.
    """
    if mesh_id:
        log_dir = os.path.join(log_dir, mesh_id)

    os.makedirs(log_dir, exist_ok=True)

    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # One-hot class assignment used to colour the mesh for the renders.
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        palette = torch.stack((highlight/255, gray/255)).to(device)

        color_mesh(one_hot, mesh, palette)

        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        torchvision.utils.save_image(rendered_images, os.path.join(log_dir, 'final_render.jpg'))

        # Per-vertex colour for the exported mesh: class 0 is the highlight.
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)

In [None]:
def process_single_object(data, targets, model_id, config, validation=False):
    """Mesh one point cloud, optimise the highlighter on it and score it.

    Args:
        data: points passed to ``point_cloud_to_mesh_part3``.
        targets: label array; its first two columns are passed to
            ``calculate_miou``.
        model_id: identifier used in file and folder names.
        config: dict with the paths and hyper-parameters read below.
        validation: when True, a per-mesh results folder is created and the
            optimisation writes its outputs there.

    Returns:
        The value returned by ``calculate_miou``, or ``None`` when the mesh
        could not be created or any step raised (the error is printed).
    """
    try:
        mesh_filename = f"{config['target_category']}_{model_id}_mesh.obj"
        mesh_path = os.path.join(config['output_meshes_dir'], mesh_filename)

        mesh_created = point_cloud_to_mesh_part3(data, mesh_path)[0]
        if not mesh_created:
            return None

        prompt = f"a 3D rendering of {config['target_category']} show the region highlited optimized for listening"

        current_mesh_id = None
        if validation:
            current_mesh_id = f"{config['target_category']}_{model_id}"
            # Creates both the per-mesh folder and its `renders` sub-folder.
            mesh_results_dir = os.path.join(config['output_results_dir'], current_mesh_id)
            os.makedirs(os.path.join(mesh_results_dir, 'renders'), exist_ok=True)

        pred_class = optimize_highlighted_regions(
            obj_path=mesh_path,
            prompt=prompt,
            output_dir=config['output_results_dir'],
            clip_model_name=config['clip_model_name'],
            render_res=config['render_res'],
            learning_rate=config['learning_rate'],
            n_iter=config['n_iter'],
            n_views=config['n_views'],
            n_augs=config['n_augs'],
            device=config['device'],
            mesh_id=current_mesh_id,
            is_validation=validation
        )

        return calculate_miou(
            predictions=pred_class,
            targets=targets[:,:2],
            model_cat=config['target_category'],
            affordance='Listen'
        )
    except Exception as e:
        print(f"Error processing object {model_id}: {str(e)}")
        return None

In [None]:
def _evaluate_split(dataset, config, max_samples, validation):
    """Process a random subset of one split's target-category objects.

    Returns a list of ``"<model_id>: <result>"`` strings, one per object for
    which ``process_single_object`` returned a non-``None`` result.
    """
    target_category = config['target_category']

    # Every item is visited once just to read its category (5th field).
    candidate_indices = [
        idx for idx, (_, _, _, _, model_cat) in enumerate(dataset)
        if model_cat == target_category
    ]
    chosen = random.sample(candidate_indices, min(len(candidate_indices), max_samples))

    results = []
    for idx in chosen:
        datas, _, targets, model_id, model_cat = dataset[idx]
        # Cheap re-check kept from the original code; indexing is presumably
        # deterministic, in which case this never triggers.
        if model_cat != target_category:
            continue
        result = process_single_object(datas, targets, model_id, config, validation=validation)
        if result is not None:
            results.append(f"{model_id}: {result}")
    return results


def run_pipeline(config, dataset, dataset_val, max_samples=3):
    """Optimise and score a few objects from the train and validation splits.

    Args:
        config: Mapping with at least ``target_category`` and ``output_dir``
            (plus whatever ``process_single_object`` reads).
        dataset: Training split; items are 5-tuples whose last element is the
            object category.
        dataset_val: Validation split with the same item layout.
        max_samples: Maximum number of objects drawn at random from each
            split (defaults to 3, the previously hard-coded value).

    Returns:
        ``(train_results, val_results)``: two lists of
        ``"<model_id>: <result>"`` strings. The same data is also written to
        ``optimization_results.json`` inside ``config['output_dir']``.
    """
    print(f"Starting optimization for {config['target_category']}")

    # Train first, then validation: both draw from the global `random` state,
    # so the order of these two calls determines which samples are picked.
    train_results = _evaluate_split(dataset, config, max_samples, validation=False)
    val_results = _evaluate_split(dataset_val, config, max_samples, validation=True)

    results = {
        'train_results': train_results,
        'val_results': val_results
    }

    # Do not rely on the caller having created the output directory.
    os.makedirs(config['output_dir'], exist_ok=True)
    with open(os.path.join(config['output_dir'], 'optimization_results.json'), 'w') as f:
        json.dump(results, f, indent=2)

    return train_results, val_results

In [None]:
# Experiment settings shared by every stage of the PART3 pipeline.
config = dict(
    data_dir="/content/dataset",
    output_dir="/content/output_PART3",
    output_meshes_dir="/content/output_PART3/meshes",
    output_results_dir="/content/output_PART3/results",
    target_category="Earphone",
    clip_model_name='ViT-B/32',
    render_res=224,
    learning_rate=0.0001,
    n_iter=2500,
    n_views=5,
    n_augs=5,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Make sure the mesh and result folders exist before anything is written.
for _dir_key in ('output_meshes_dir', 'output_results_dir'):
    os.makedirs(config[_dir_key], exist_ok=True)

# Load the train and validation splits from the same data directory.
dataset_train = AffordNetDataset(data_dir=config['data_dir'], split='train')
dataset_val = AffordNetDataset(data_dir=config['data_dir'], split='val')

# Optimise and score the sampled objects, then show the per-object results.
train_results, val_results = run_pipeline(config, dataset_train, dataset_val)
print(f"train: {train_results}")
print(f"val: {val_results}")

## **Extension - Training Augmentation**

This is the new version of the function *optimize_highlighted_regions*, in which the chosen extension is applied: we try different augmentations and compute the new mIoU.

In [None]:
def optimize_highlighted_regions(
    obj_path,
    prompt,
    output_dir='./output/',
    clip_model_name='ViT-B/32',
    render_res=224,
    res=224,
    learning_rate=0.0001,
    n_iter=2500,
    n_views=5,
    n_augs=5,
    device='cuda',
    mesh_id=None,
    is_validation=False
):
    """Train a NeuralHighlighter on one mesh against a CLIP text prompt.

    Each iteration predicts per-vertex class outputs, colours the mesh with
    them, renders ``n_views`` views and minimises ``clip_loss`` between the
    (augmented) renders and the encoded prompt.

    Args:
        obj_path: Path of the mesh file loaded with ``Mesh``.
        prompt: Text prompt encoded with CLIP.
        output_dir: Folder for outputs; a ``renders`` sub-folder is created.
        clip_model_name: Name passed to ``get_clip_model``.
        render_res: Side length given to ``Renderer`` (square renders).
        res: Side length images are resized/cropped to before CLIP.
        learning_rate: Adam learning rate.
        n_iter: Number of optimisation iterations.
        n_views: Number of views rendered per iteration.
        n_augs: Number of augmentations, forwarded to ``clip_loss``.
        device: Torch device for the MLP, prompt tokens and colour tensors.
        mesh_id: Optional identifier forwarded to the ``*_PART3`` save helpers.
        is_validation: When true, renders and a log line are saved every 100
            iterations and the final results are saved after training.

    Returns:
        The ``mlp(vertices)`` output of the last iteration (computed before
        that iteration's optimiser step, still attached to autograd). If
        ``n_iter`` is 0 the untrained prediction is returned instead.

    Note:
        Every call re-seeds the Python, NumPy and torch RNGs with 0 and sets
        cuDNN to deterministic mode, which also affects the caller's later
        use of those global generators.
    """
    # Reproducibility: fix every RNG and disable cuDNN autotuning.
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.makedirs(os.path.join(output_dir, 'renders'), exist_ok=True)

    # File name without extension; used when saving the final results.
    objbase = os.path.splitext(os.path.basename(obj_path))[0]

    render = Renderer(dim=(render_res, render_res))
    mesh = Mesh(obj_path)
    MeshNormalizer(mesh)()

    # Image pipelines: a plain resize+normalise and a randomly augmented one.
    clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    clip_transform = transforms.Compose([
        transforms.Resize((res, res)),
        clip_normalizer
    ])

    augment_params = {
        'scale': (0.8, 1.0),
        'brightness': 0.4,
        'contrast': 0.4,
        'saturation': 0.4,
        'hue': 0.2,
        'distortion_scale': 0.5
    }

    augment_transform = transforms.Compose([
        transforms.RandomResizedCrop(res, scale=augment_params['scale']),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=augment_params['distortion_scale']),
        transforms.ColorJitter(
            brightness=augment_params['brightness'],
            contrast=augment_params['contrast'],
            saturation=augment_params['saturation'],
            hue=augment_params['hue']
        ),
        transforms.GaussianBlur(kernel_size=(5, 5), sigma=(0.1, 2.0)),
        clip_normalizer
    ])

    # Initialize CLIP model and encode the prompt once (unit-normalised).
    clip_model, _ = get_clip_model(clip_model_name)
    with torch.no_grad():
        prompt_token = clip.tokenize([prompt]).to(device)
        encoded_text = clip_model.encode_text(prompt_token)
        encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

    # MLP settings
    mlp = NeuralHighlighter().to(device)
    optim = torch.optim.Adam(mlp.parameters(), learning_rate)

    # Row 0 is the highlighter colour, row 1 is gray.
    colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)
    background = torch.tensor((0., 0., 0.)).to(device)

    vertices = copy.deepcopy(mesh.vertices)
    losses = []
    pred_class = None

    # Optimization loop
    for i in tqdm(range(n_iter)):
        optim.zero_grad()

        pred_class = mlp(vertices)

        color_mesh(pred_class, mesh, colors)
        rendered_images, _, _ = render.render_views(mesh, num_views=n_views,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # Calculate CLIP loss
        loss = clip_loss(rendered_images, clip_model, encoded_text, clip_transform, augment_transform, n_augs, clipavg='view')
        # The graph is rebuilt from scratch every iteration, so there is no
        # need to retain it; letting it be freed keeps memory usage down.
        loss.backward()

        optim.step()

        losses.append(loss.item())

        if is_validation and i % 100 == 0:
            save_renders_PART3(output_dir, i, rendered_images, mesh_id=mesh_id)
            with open(os.path.join(output_dir, "training_info.txt"), "a") as f:
                f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

    if pred_class is None:
        # n_iter == 0: nothing was trained, still return a valid prediction.
        with torch.no_grad():
            pred_class = mlp(vertices)

    if is_validation:
        save_final_results_PART3(output_dir, objbase, mesh, mlp, vertices, colors, render, background, mesh_id=mesh_id)

    return pred_class