In [18]:
import os
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch
import numpy as np

In [2]:
model_name = "ahmed-masry/unichart-base-960"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = DonutProcessor.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from PIL import Image

def get_image_embeddings(model, processor, image_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    with torch.no_grad():
        image_embeddings = model.get_encoder()(pixel_values).last_hidden_state

    return image_embeddings.cpu().numpy().flatten()

image_path = "bmc2.jpeg"
image_embeddings = get_image_embeddings(model, processor, image_path)
print(image_embeddings.shape)

(921600,)


In [17]:
image_folder = "test"
image_files = [
    os.path.join(image_folder, file)
    for file in os.listdir(image_folder)
    if file.endswith((".png", ".jpg", ".gif"))
]

# image_files = ["imgs/10335933_Fig5.jpg"]
unichart_embeddings = {"figure_name": [], "embedding": []}

for i, image_file in enumerate(image_files[:500]):
    print(f"Processing image {i+1}/{len(image_files)}")
    image_embedding = get_image_embeddings(model, processor, image_file)
    unichart_embeddings["figure_name"].append(image_file)
    unichart_embeddings["embedding"].append(image_embedding)

Processing image 1/3781
Processing image 2/3781
Processing image 3/3781
Processing image 4/3781
Processing image 5/3781
Processing image 6/3781
Processing image 7/3781
Processing image 8/3781
Processing image 9/3781
Processing image 10/3781
Processing image 11/3781
Processing image 12/3781
Processing image 13/3781
Processing image 14/3781
Processing image 15/3781
Processing image 16/3781
Processing image 17/3781
Processing image 18/3781
Processing image 19/3781
Processing image 20/3781
Processing image 21/3781
Processing image 22/3781
Processing image 23/3781
Processing image 24/3781
Processing image 25/3781
Processing image 26/3781
Processing image 27/3781
Processing image 28/3781
Processing image 29/3781
Processing image 30/3781
Processing image 31/3781
Processing image 32/3781
Processing image 33/3781
Processing image 34/3781
Processing image 35/3781
Processing image 36/3781
Processing image 37/3781
Processing image 38/3781
Processing image 39/3781
Processing image 40/3781
Processin

In [19]:
embeddings = np.array(unichart_embeddings["embedding"]).reshape(500, 921600)
names = unichart_embeddings["figure_name"]
paper_ids = [name.split("/")[-1].split("_")[0] for name in names]

In [22]:
from nomic import atlas

dataset = atlas.map_data(
    data=[{"name": name, "paper": paper} for name, paper in zip(names, paper_ids)], id_field="name", embeddings=embeddings
)

[32m2024-04-04 10:55:20.243[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m897[0m - [1mCreating dataset `inventive-pascal`[0m
[32m2024-04-04 10:55:20.533[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m108[0m - [1mUploading data to Atlas.[0m
  0%|          | 0/55 [00:00<?, ?it/s][32m2024-04-04 10:57:00.800[0m | [31m[1mERROR   [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1513[0m - [31m[1mShard upload failed: {"detail":"Upload Error: Max Embedding Dimension Supported is 3072"}[0m
[32m2024-04-04 10:57:02.725[0m | [31m[1mERROR   [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1513[0m - [31m[1mShard upload failed: {"detail":"Upload Error: Max Embedding Dimension Supported is 3072"}[0m
[32m2024-04-04 10:57:05.540[0m | [31m[1mERROR   [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1513[0m - [31m[1mShard upload failed: {"detail":"Upload Error: Max Embedding Dimension Supported is 3072"}[0m
