In [1]:
import re
import hashlib
import os
import torch
from huggingface_hub import snapshot_download
from PIL import Image

from moondream_model import VisionEncoder, TextModel

# Fetch the moondream1 checkpoint from the Hugging Face Hub; snapshot_download
# returns the local directory that holds the downloaded files.
model_path = snapshot_download("vikhyatk/moondream1")

# Prefer the GPU, but fall back to CPU so the notebook still runs (slowly) on a
# machine without CUDA instead of failing when the models are moved to the device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is kept for the GPU path; float32 is the conservative choice on CPU.
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32

# Both halves of the model are loaded from the same checkpoint directory and
# moved to the selected device/dtype once, up front.
vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)

print(f"Using:{DEVICE}")
print(f"Type: {DTYPE}")

def cached_vision_encoder(image, cache_dir="image_encoder_cache"):
    """Return the vision-encoder embedding for ``image``, backed by a disk cache.

    The cache key is a SHA-256 digest of the image's mode, size and raw pixel
    bytes. On a hit the stored tensor is loaded; on a miss the image is run
    through ``vision_encoder`` and the result is written to ``cache_dir``.

    Args:
        image: A PIL image (must provide ``mode``, ``size`` and ``tobytes()``).
        cache_dir: Directory holding the ``<digest>.pt`` cache files. Created
            on the first cache miss if it does not exist.

    Returns:
        The embedding tensor on ``DEVICE`` with dtype ``DTYPE``.
    """
    # tobytes() yields only the raw pixel buffer, so two images with identical
    # bytes but a different size or mode would otherwise share a cache entry.
    # Entries written under the old pixel-only key are not matched and are
    # simply recomputed.
    digest = hashlib.sha256()
    digest.update(f"{image.mode}:{image.size[0]}x{image.size[1]}:".encode("utf-8"))
    digest.update(image.tobytes())
    cache_path = os.path.join(cache_dir, f"{digest.hexdigest()}.pt")

    if os.path.exists(cache_path):
        # NOTE(review): torch.load unpickles the file; only point cache_dir at
        # a location whose contents you trust.
        return torch.load(cache_path).to(DEVICE, dtype=DTYPE)

    # The embedding is converted to float16 on CPU *before* it is both saved
    # and returned, so hit and miss paths yield the same values.
    image_vec = vision_encoder(image).to("cpu", dtype=torch.float16)
    os.makedirs(cache_dir, exist_ok=True)
    torch.save(image_vec, cache_path)
    return image_vec.to(DEVICE, dtype=DTYPE)

def answer_question(image, question):
    """Ask the text model ``question`` about ``image`` and return a cleaned string.

    The image is embedded through ``cached_vision_encoder`` and the embedding is
    handed to ``text_model.answer_question``. Whatever comes back is normalised
    to a ``str`` and a trailing ``END`` and then a trailing ``<`` are removed.

    Args:
        image: Image passed straight to ``cached_vision_encoder``.
        question: Prompt passed straight to ``text_model.answer_question``.

    Returns:
        The answer text with the trailing end-marker remnants stripped.
    """
    # Embedding and generation both run with autograd disabled.
    with torch.inference_mode():
        embedding = cached_vision_encoder(image)
        raw_output = text_model.answer_question(embedding, question)

    # A tuple result carries the answer in its first slot.
    answer = raw_output[0] if isinstance(raw_output, tuple) else raw_output

    # Coerce non-string answers: tensors become space-separated element
    # strings, anything else goes through str().
    if torch.is_tensor(answer):
        elements = answer.cpu().numpy().tolist()
        answer = ' '.join(str(element) for element in elements)
    elif not isinstance(answer, str):
        answer = str(answer)

    # Strip a trailing "END" first, then a trailing "<" left behind by it.
    without_end = re.sub("END$", "", answer)
    return re.sub("<$", "", without_end)



def moondream(img, prompt):
    """Convenience entry point: answer ``prompt`` about ``img``.

    Forwards both arguments unchanged to ``answer_question`` and returns its
    result.
    """
    return answer_question(img, prompt)


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Using:cuda
Type: torch.bfloat16


In [3]:
# Usage example: open one sample frame from the local img/ directory and ask
# the model for a free-form description of it.
img = Image.open("img/output_000017.jpg")
prompt = "Describe this image."
# moondream() forwards to answer_question() and returns its cleaned answer.
result = moondream(img, prompt)
# print() so that a multi-paragraph answer is shown with its line breaks.
print(result)

Generate method took 1.9480829238891602 seconds.
The image features a young girl with long blonde hair, wearing a purple dress, standing in a room. She is holding a blow dryer in her hand, possibly getting ready to style her hair. The girl appears to be in a bathroom setting.

In the room, there is a chair located to the left side of the girl, and a bottle can be seen placed on a surface nearby. The overall atmosphere seems to be casual and comfortable, as the girl is engaged in her hair care routine.
