In [1]:
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch

# Load the latest Moondream2 model
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-06-21",
    trust_remote_code=True,
    device_map="auto" if torch.cuda.is_available() else None
)

tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2", trust_remote_code=True)


In [2]:
# Load your dataset
dataset = load_dataset("Adi-0-0-Gupta/Eyewear-Dataset-1024")



In [3]:
# dataset = dataset['train'].select(range(100))

In [4]:


def caption_batch(batch):
    """
    Function to caption a batch of images from the dataset
    """
    captions = []
    
    for image in batch["image"]:  # Process each image in the batch
        try:
            # If image is a path string, load it
            if isinstance(image, str):
                image = Image.open(image)
                image = model.encode(image)
            
            # Generate caption using Moondream2
            caption_result = model.caption(image, length="normal")
            captions.append(caption_result["caption"])
            
        except Exception as e:
            # Handle any errors gracefully
            captions.append(f"Error generating caption: {str(e)}")
    
    # Return the batch with new caption column
    batch["caption"] = captions
    return batch

# Apply the captioning function using batched map
dataset_with_captions = dataset.map(
    caption_batch,
    batched=True,           # Enable batch processing
    batch_size=20,           # Process 8 images at a time (adjust based on your GPU memory)
    num_proc=1,             # Use single process to avoid GPU memory conflicts
    desc="Adding captions"
)

print("Dataset with captions:")
print(dataset_with_captions)
print("\nSample caption:")
print(dataset_with_captions[0]["caption"])


Adding captions:   0%|          | 0/20964 [00:00<?, ? examples/s]

Dataset with captions:
DatasetDict({
    train: Dataset({
        features: ['brand', 'prompt', 'product_type', 'image', 'control_image', 'caption'],
        num_rows: 20964
    })
})

Sample caption:


KeyError: "Invalid key: 0. Please first select a split. For example: `my_dataset_dictionary['train'][0]`. Available splits: ['train']"

In [7]:
dataset_with_captions['train'][0]

{'brand': 'Polaroid',
 'prompt': 'shape is round / oval, technology is r, frame material is polycarbonate,.',
 'product_type': 'sunglasses',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024>,
 'control_image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024>,
 'caption': 'A close-up view of a pair of Polaroid sunglasses. The frame is a dark brown color with a tortoiseshell pattern and a slightly curved shape. The clear lenses reflect a blue tint. The right arm of the sunglasses is slightly extended, showcasing the sleek design. The brand name "Polaroid" is prominently displayed on the right arm of the frame. Two small white dots are visible on the left arm, possibly indicating fasteners. The sunglasses are set against a stark white background, which highlights their features and design.'}

In [8]:
dataset_with_captions.push_to_hub("dnth/Eyewear-Dataset-1024-with-captions")

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ? shards/s]

Map:   0%|          | 0/4193 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Map:   0%|          | 0/4193 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Map:   0%|          | 0/4193 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Map:   0%|          | 0/4193 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Map:   0%|          | 0/4192 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dnth/Eyewear-Dataset-1024-with-captions/commit/66a3b52eb9ea141d7408e73a9cada86ae5775e9a', commit_message='Upload dataset', commit_description='', oid='66a3b52eb9ea141d7408e73a9cada86ae5775e9a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dnth/Eyewear-Dataset-1024-with-captions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dnth/Eyewear-Dataset-1024-with-captions'), pr_revision=None, pr_num=None)