In [None]:
# import open3d as o3d
import torch
import numpy as np
import open3d as o3d
from pathlib import Path

import cv2
import numpy as np

from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from pathlib import Path

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Capturing Object from different views

In [None]:
# Read the Stanford bunny mesh from disk and compute its per-vertex normals.
bunny_obj_path = "/root/multiview-robust-clip/data/shapes/bunny.obj"
bunny = o3d.io.read_triangle_mesh(bunny_obj_path)
bunny.compute_vertex_normals()

In [None]:
%%capture
bunny = o3d.io.read_triangle_mesh("/root/multiview-robust-clip/data/shapes/bunny.obj")
bunny.compute_vertex_normals()

angle = 5

rot_matrix = np.array([[np.cos(np.radians(angle)), 0, -np.sin(np.radians(angle))],
                             [0, 1, 0],
                             [np.sin(np.radians(angle)), 0, np.cos(np.radians(angle))]])

current_angle = 0  
for i in range(360 // angle):
    bunny.rotate(rot_matrix)
    vis = o3d.visualization.Visualizer()
    vis.create_window()
    vis.add_geometry(bunny)
    vis.update_geometry(bunny)
    vis.capture_screen_image(f'cameraparams_{current_angle:03d}.png', do_render=True)

    current_angle += angle


In [None]:
# Show the bunny mesh in Open3D's Plotly-based viewer.
preview_geometries = [bunny]
o3d.visualization.draw_plotly(preview_geometries)

# Evaluating CLIP features

In [None]:
# Run CLIP on every rendered bunny view together with a single text prompt.

# The device must exist before the model is moved onto it; fall back to the CPU
# when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image_path = Path("/root/multiview-robust-clip/data/renderings")

# Sorted by file name so the view order is deterministic.
images = [Image.open(x) for x in sorted(image_path.iterdir())]

inputs = processor(text=["a photo of the stanford bunny"], images=images, return_tensors="pt", padding=False).to(device)

# Inference only: skip building the autograd graph for the whole image batch.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [None]:
import matplotlib.pyplot as plt

# CALCULATE SIMILARITIES
# Pairwise similarity / distance between the CLIP image embeddings of all views.
image_embeds = outputs['image_embeds'].detach().cpu().numpy()
cosine_similarities = cosine_similarity(image_embeds, image_embeds)
# The result is stored as `euc_dist`, not `euclidean_distances`: reusing the
# function's name would rebind it to an array and break every later call.
euc_dist = euclidean_distances(image_embeds, image_embeds)

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.imshow(cosine_similarities, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.title('Cosine Similarity')

plt.subplot(1, 2, 2)
plt.imshow(euc_dist, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.title('Euclidean Distance')

plt.show()

In [None]:
# MAKE VIDEO OF THE 360 DEGREE ROTATION

# cv2.VideoWriter takes the frame size as (width, height) and every written frame
# must have exactly that size. PIL's Image.size is also (width, height), so the
# size is taken from the first image; the old fixed 1920x1080 is kept only as the
# fallback for an empty image list.
frame_size = images[0].size if images else (1920, 1080)

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('output.mp4', fourcc, 20.0, frame_size)

try:
    # Load images and write to video
    for image in images:
        # Normalise to 3-channel RGB first so images stored in another mode
        # (e.g. with an alpha channel) convert cleanly to BGR.
        image_np = np.array(image.convert('RGB'))
        out.write(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
finally:
    # Release the VideoWriter object even if a frame fails to convert.
    out.release()


# Analyze Objaverse data

In [None]:
# Root directory of the Objaverse renderings used by the cells below.
renderings_root = "/root/multiview-robust-clip/data/objaverse/renderings"
renderings = Path(renderings_root)

In [None]:
from torchmetrics.functional.pairwise import pairwise_cosine_similarity

# Build one pairwise cosine-similarity matrix per entry in `renderings`, from the
# tensors saved in that entry's *.pt files.
# Presumably each file holds the embedding of one rendered view -- TODO confirm.
cossim_list = list()
for shape in sorted(renderings.iterdir()):
    # Files are sorted by name so the tensors are stacked in a stable order.
    # map_location="cpu" keeps everything on the CPU even if a tensor was saved
    # from a GPU, so the matrices can be stacked and plotted without a device move.
    # NOTE: torch.load unpickles the file -- only load files from a trusted source.
    embeddings = [torch.load(pt, map_location="cpu") for pt in sorted(shape.glob("*.pt"))]
    if not embeddings:
        # Stray files and folders without saved tensors would make torch.stack raise.
        continue
    embeddings = torch.stack(embeddings)
    simm = pairwise_cosine_similarity(embeddings)
    cossim_list.append(simm)

In [None]:
# Stack the per-shape similarity matrices along a new leading axis, then reduce
# over that axis to get the element-wise mean and standard deviation.
all_sim = torch.stack(cossim_list, dim=0)
mean_sim = torch.mean(all_sim, dim=0)
std_sim = torch.std(all_sim, dim=0)

In [None]:
import matplotlib.pyplot as plt

# Two heatmaps side by side: element-wise mean (left) and standard deviation
# (right) of the stacked cosine-similarity matrices.
heatmap_panels = [
    (mean_sim, 'Mean CosSim'),
    (std_sim, 'STD CosSim'),
]

plt.figure(figsize=(12, 6))
for panel_index, (matrix, panel_title) in enumerate(heatmap_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.imshow(matrix, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.title(panel_title)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Same two heatmaps as the full mean/std plot, restricted to the top-left
# 36 x 36 corner of each matrix.
cropped_panels = [
    (mean_sim[:36, :36], 'Mean CosSim'),
    (std_sim[:36, :36], 'STD CosSim'),
]

plt.figure(figsize=(12, 6))
for panel_index, (matrix, panel_title) in enumerate(cropped_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.imshow(matrix, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.title(panel_title)

plt.show()

In [None]:
# Run CLIP on all PNG renderings of one Objaverse shape.

# Fall back to the CPU when no CUDA device is available instead of failing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

sample_image = renderings / "323851f10fd7483aa803594767ba693a"

# Sorted by file name so the view order is deterministic.
images = [Image.open(x) for x in sorted(sample_image.glob("*.png"))]

# The model call below is given both text and images; this prompt is the text input.
inputs = processor(text=["test test 123"], images=images, return_tensors="pt", padding=False).to(device)

# Inference only: skip building the autograd graph for the whole image batch.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [None]:
import matplotlib.pyplot as plt
# Imported under a private alias so the call below cannot be affected by any
# notebook variable that reuses the name `euclidean_distances`.
from sklearn.metrics.pairwise import euclidean_distances as sk_euclidean_distances

# CALCULATE SIMILARITIES
embeds_cpu = outputs['image_embeds'].detach().cpu()
image_embeds = embeds_cpu.numpy()
# torchmetrics' pairwise_cosine_similarity works on torch tensors, so it gets the
# CPU tensor rather than the NumPy array. Per the torchmetrics docs, calling it with
# a single input zeroes the diagonal by default.
cosine_similarities = pairwise_cosine_similarity(embeds_cpu)
euc_dist = sk_euclidean_distances(image_embeds, image_embeds)

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.imshow(cosine_similarities, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.title('Cosine Similarity')

plt.subplot(1, 2, 2)
plt.imshow(euc_dist, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.title('Euclidean Distance')

plt.show()