In [1]:
from PIL import Image

from transformers import CLIPProcessor, CLIPModel
import os
import polars as pl
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the CLIP weights and the matching preprocessor from the same
# checkpoint id so that image preprocessing agrees with what the model expects.
# The model is created first, then the processor.
model, processor = (
    CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
)

In [6]:
import torch
from PIL import Image

def get_image_embedding(image_path, model, processor):
    """
    Return the CLIP image embedding for the image stored at ``image_path``.

    The image file is opened inside a context manager so its file handle is
    released as soon as preprocessing is done, instead of staying open for
    every image when this function is called in a long loop.

    Args:
        image_path (str): Path to the image file.
        model (CLIPModel): The pretrained CLIP model.
        processor (CLIPProcessor): The CLIP processor paired with ``model``.

    Returns:
        torch.Tensor: Whatever ``model.get_image_features`` returns for the
        single preprocessed image. With the checkpoint used in this notebook
        the printed shape is ``torch.Size([1, 512])``.

    Errors raised by ``Image.open`` (for example a missing or unreadable
    file) are not caught here and propagate to the caller.
    """
    # Preprocess while the file is still open: PIL reads pixel data lazily, so
    # the processor must consume the image before the handle is closed.
    with Image.open(image_path) as image:
        inputs = processor(images=image, return_tensors="pt")

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)

    return outputs

In [7]:
# Smoke test: embed a single image and check the shape of the result.
image_path = "bmc2.jpeg"
image_embedding = get_image_embedding(image_path, model, processor)
print(image_embedding.shape)  # Output: torch.Size([1, 512])

torch.Size([1, 512])


In [19]:
# Collect the image files to embed. os.listdir() returns entries in arbitrary
# order, so the listing is sorted to make the row order of the resulting
# embeddings reproducible from run to run.
# NOTE(review): the filter is case-sensitive and does not match ".jpeg", so
# such files in the folder are skipped. Confirm that this is intended.
image_folder = "test"
image_files = [
    os.path.join(image_folder, file)
    for file in sorted(os.listdir(image_folder))
    if file.endswith((".png", ".jpg", ".gif"))
]

# Debug override: uncomment to run the loop on a single known figure.
# image_files = ["imgs/10335933_Fig5.jpg"]

# NOTE(review): img_data is loaded here but not referenced by any cell visible
# in this notebook.
img_data = pl.read_parquet("figure_data.parquet")

# Parallel lists: entry k of "embedding" belongs to entry k of "figure_name".
clip_embeddings = {"figure_name": [], "embedding": []}

for i, image_file in enumerate(image_files):
    print(f"Processing image {i+1}/{len(image_files)}")
    image_embedding = get_image_embedding(image_file, model, processor)
    clip_embeddings["figure_name"].append(image_file)
    # Store a NumPy array rather than the torch tensor.
    clip_embeddings["embedding"].append(image_embedding.numpy())

Processing image 1/3781
Processing image 2/3781
Processing image 3/3781
Processing image 4/3781
Processing image 5/3781
Processing image 6/3781
Processing image 7/3781
Processing image 8/3781
Processing image 9/3781
Processing image 10/3781
Processing image 11/3781
Processing image 12/3781
Processing image 13/3781
Processing image 14/3781
Processing image 15/3781
Processing image 16/3781
Processing image 17/3781
Processing image 18/3781
Processing image 19/3781
Processing image 20/3781
Processing image 21/3781
Processing image 22/3781
Processing image 23/3781
Processing image 24/3781
Processing image 25/3781
Processing image 26/3781
Processing image 27/3781
Processing image 28/3781
Processing image 29/3781
Processing image 30/3781
Processing image 31/3781
Processing image 32/3781
Processing image 33/3781
Processing image 34/3781
Processing image 35/3781
Processing image 36/3781
Processing image 37/3781
Processing image 38/3781
Processing image 39/3781
Processing image 40/3781
Processin

In [25]:
names = clip_embeddings["figure_name"]

# Stack the per-image arrays into a single 2-D matrix with one row per image.
# The row count comes from the data and the embedding width is inferred (-1),
# so this cell keeps working when the number of images or the model changes.
embeddings = np.array(clip_embeddings["embedding"]).reshape(len(names), -1)

# The paper id is taken as the part of the file name before the first "_"
# (presumably names look like "<paper id>_<figure>.<ext>" -- confirm).
# os.path.basename strips the directory using the platform's separator, which
# matches how the paths were built with os.path.join.
paper_ids = [os.path.basename(name).split("_")[0] for name in names]

In [36]:
from nomic import atlas
import numpy as np

# Upload one record per image (file name plus paper id) together with the
# matching embedding rows; the file name serves as the unique id field.
dataset = atlas.map_data(
    data=[dict(name=n, paper=p) for n, p in zip(names, paper_ids)],
    id_field="name",
    embeddings=embeddings,
)

[32m2024-04-04 10:27:52.461[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m897[0m - [1mCreating dataset `observant-borgonio`[0m
[32m2024-04-04 10:27:52.797[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m108[0m - [1mUploading data to Atlas.[0m
1it [00:03,  3.89s/it]
[32m2024-04-04 10:27:56.761[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1567[0m - [1mUpload succeeded.[0m
[32m2024-04-04 10:27:56.775[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m123[0m - [1m`quackmires/observant-borgonio`: Data upload succeeded to dataset`[0m
[32m2024-04-04 10:27:58.015[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1276[0m - [1mCreated map `observant-borgonio` in dataset `quackmires/observant-borgonio`: https://atlas.nomic.ai/data/quackmires/observant-borgonio/map[0m


In [35]:
# Derive the paper id from each file name: the part of the base name before
# the first "_". os.path.basename is used instead of splitting on "/" so the
# directory is stripped correctly whatever separator os.path.join produced.
paper_ids = [os.path.basename(name).split("_")[0] for name in names]

In [20]:
# Sanity check on the embedding matrix dimensions.
# NOTE(review): the saved output of this cell, (10000, 256), comes from an
# earlier execution (In [20]) and does not match the (3781, 512) reshape
# applied to `embeddings` elsewhere in this notebook. Re-run to refresh it.
embeddings.shape

(10000, 256)