In [5]:
!pip install transformers timm einops



In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Checkpoint to load from the Hugging Face Hub; the same repo id and revision
# are used for both the tokenizer and the model so the two stay in sync.
model_id = "vikhyatk/moondream2"
revision = "2024-08-26"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# trust_remote_code=True lets transformers run the model code shipped in the
# Hub repo, so keep `revision` pinned to a snapshot you have reviewed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
)

**1. Image Captioning (Image Description)**

In [2]:
from PIL import Image
# Image captioning: encode the picture once, then ask for a description.
image = Image.open('street.jpg')
enc_image = model.encode_image(image)

# Named `prompt` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
prompt = "Describe this image."
output = model.answer_question(enc_image, prompt, tokenizer)
print(f"\nInput: {prompt}\nOutput: {output}")


Input: Describe this image.
Output: The image captures a bustling street scene in London, featuring iconic red double-decker buses and the famous Big Ben clock tower in the background.


**2. Visual Question-Answering (Visual Conversation)**

In [4]:
# Visual question answering: the image is encoded once and the same encoding
# is reused for every question.
image = Image.open('cats.jpg')
enc_image = model.encode_image(image)

questions = [
    "How many cats the girl is holding?",
    "What is their color?",
]

# Each question is sent on its own with the encoded image; no earlier answer
# is passed to the call. The loop variable is `question` (not `input`) so the
# builtin input() stays usable in this kernel.
for question in questions:
    output = model.answer_question(enc_image, question, tokenizer)
    print(f"\nInput: {question}\nOutput: {output}")


Input: How many cats the girl is holding?
Output: The girl is holding two cats.

Input: What is their color?
Output: The kittens are brown and white in color.


**3. Visual Knowledge Reasoning**

In [5]:
# Visual knowledge reasoning: ask an open-ended question about the pictured place.
image = Image.open('tajmahal.jpg')
enc_image = model.encode_image(image)

# `prompt` instead of `input` keeps the builtin input() unshadowed.
prompt = "Tell about the history of this place"
output = model.answer_question(enc_image, prompt, tokenizer)
print(f"\nInput: {prompt}\nOutput: {output}")


Input: Tell about the history of this place
Output: The image shows the Taj Mahal, a famous white marble mausoleum located in Agra, India. It was built by the Mughal Emperor Shah Jahan in memory of his wife Mumtaz Mahal. The Taj Mahal is an iconic symbol of love and architectural prowess, and it has become a popular tourist destination. The mausoleum is surrounded by a large garden with a reflecting pool, which adds to its beauty and grandeur. The image captures the Taj Mahal from a distance, showcasing its impressive size and intricate design.


**4. Visual Contextual Understanding**

In [6]:
# Visual contextual understanding: ask about the subject's feelings and the reason.
image = Image.open('boy_playing_with_pet.jpg')
enc_image = model.encode_image(image)

# `prompt` instead of `input` keeps the builtin input() unshadowed.
prompt = "How does the boy feel and why?"
output = model.answer_question(enc_image, prompt, tokenizer)
print(f"\nInput: {prompt}\nOutput: {output}")


Input: How does the boy feel and why?
Output: The boy feels happy and content because he is enjoying a fun and engaging activity with his dog. In the image, the boy is running and playing with a yellow dog, throwing a ball for it to fetch. This activity allows the boy to bond with his dog, exercise, and have a good time. Playing with a dog is known to release endorphins, which are chemicals in the brain that contribute to feelings of happiness and well-being. The boy's smile and the dog's excitement indicate that they are both enjoying the moment and having a positive experience together.


**5. Text Recognition**

In [7]:
# Text recognition: ask the model to read the text in the image.
image = Image.open('written_quote.jpg')
enc_image = model.encode_image(image)

# `prompt` instead of `input` keeps the builtin input() unshadowed.
prompt = "What's written on this piece of paper?"
output = model.answer_question(enc_image, prompt, tokenizer)
print(f"\nInput: {prompt}\nOutput: {output}")


Input: What's written on this piece of paper?
Output: The handwritten quote says, "The future depends on what you do today. Mahatma Gandhi."
