<a href="https://colab.research.google.com/github/colesmcintosh/smol-vision/blob/main/smol_vision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langgraph langchain_ollama langchain ollama

In [None]:
from typing import List, TypedDict

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_ollama import ChatOllama

# Chat model client for the "llama3.1:8b" Ollama model.
llama_8b = ChatOllama(model="llama3.1:8b")

class ImageSchema(BaseModel):
    """Structured attributes extracted from a textual image description."""

    creative_title: str = Field(description="The creative title of the image")
    subject: str = Field(description="The subject of the image")
    # Typed as List[str] (not a bare list) so the generated schema asks for
    # string items and callers can safely ', '.join(...) the values.
    colors: List[str] = Field(description="The colors in the image")
    setting: str = Field(description="The setting of the image")

# Wrap the chat model so its replies are returned as ImageSchema instances.
structured_llama = llama_8b.with_structured_output(ImageSchema)

In [None]:
def extract_data(image_description):
    """Extract structured image attributes from a free-text description.

    Fills the extraction instructions with ``image_description`` and sends the
    resulting prompt to ``structured_llama``.

    Args:
        image_description: Text describing the image.

    Returns:
        Whatever ``structured_llama.invoke`` returns for the filled-in prompt.
    """
    # Only the {image_description} placeholder is substituted; the rest of the
    # template (including its leading whitespace) is sent to the model verbatim.
    template = """
    Extract the following data from the image description:
    Creative Title (str): a creative title for the image
    Subject (str): the subject of the image
    Colors (List[str]): the colors in the image
    Setting (str): the setting of the image
    ---
    Image Description: "{image_description}"
    """
    filled_prompt = template.format(image_description=image_description)
    return structured_llama.invoke(filled_prompt)

In [None]:
import ollama

def describe_image(image_path, model="moondream"):
    """Ask an Ollama vision model for a detailed description of an image.

    Args:
        image_path: Path of the image file to describe.
        model: Name of the Ollama model to query. Defaults to "moondream".

    Returns:
        The text content of the model's reply.
    """
    res = ollama.chat(
        model=model,
        messages=[
            {
                # The image is attached to the same user message as the
                # prompt: a separate {'images': [...]} entry has no role or
                # content and is not a well-formed chat message.
                'role': 'user',
                'content': """Describe the image in as much detail as possible.""",
                'images': [image_path],
            }
        ],
    )

    return res['message']['content']

In [None]:
class AgentState(TypedDict):
    """State dictionary passed between the nodes of the image-analysis graph."""
    # Path of the image to analyze; supplied by the caller when invoking the graph.
    image_path: str
    # Free-text description of the image, written by the description node.
    image_description: str
    # Structured fields extracted from the description, written by the extraction node.
    image_data: ImageSchema

In [None]:
from langgraph.graph import END, StateGraph, START

# Build a two-step graph: describe the image, then extract structured data
# from that description.
workflow = StateGraph(AgentState)

def generate_description(state):
    """Graph node: describe the image at ``state['image_path']``.

    Returns a new state dict with ``image_description`` filled in; the input
    ``state`` is not modified.
    """
    return {**state, 'image_description': describe_image(state['image_path'])}

def extract_data_node(state):
    """Graph node: turn ``state['image_description']`` into structured data.

    Delegates to the prompt-building ``extract_data(image_description)`` helper
    defined earlier in the notebook. The node function has its own name on
    purpose: defining it as ``extract_data`` would shadow that helper and send
    the raw description to the model without the extraction instructions.

    Returns a new state dict with ``image_data`` filled in; the input ``state``
    is not modified.
    """
    return {**state, 'image_data': extract_data(state['image_description'])}

workflow.add_node('generate_description', generate_description)
# The node keeps its original name in the graph; only the Python function
# backing it was renamed.
workflow.add_node('extract_data', extract_data_node)

workflow.add_edge(START, 'generate_description')
workflow.add_edge('generate_description', 'extract_data')
workflow.add_edge('extract_data', END)

agent = workflow.compile()

In [None]:
# Run the full graph on one image; only image_path is supplied up front.
# NOTE(review): the recorded output of this run shows
# 'images/sweat_suit_cat.jpeg' as image_path — confirm which path matches
# where the image actually lives.
result = agent.invoke({'image_path': 'sweat_suit_cat.jpeg'})

In [None]:
# Dump the final graph state: input path, generated description, extracted data.
print(result)

{'image_path': 'images/sweat_suit_cat.jpeg', 'image_description': '\nThe image shows a cat walking down the street in a gray hoodie. The cat appears to be wearing a sweater, giving it an adorable and unique appearance. In the background, there are several people visible on the sidewalk, going about their day. One person is carrying a handbag, while another has a backpack. The scene captures the essence of everyday life in a city setting.', 'image_data': ImageSchema(creative_title='Whiskers on the Move', subject='Cat', colors=['Gray', 'Brown'], setting='City Street')}


In [None]:
# Print the extracted fields, one per line. Colors are coerced with str() so
# the join cannot raise TypeError if the model returns non-string items.
print(
    f"Creative Title: {result['image_data'].creative_title}",
    f"Subject: {result['image_data'].subject}",
    f"Colors: {', '.join(map(str, result['image_data'].colors))}",
    f"Setting: {result['image_data'].setting}",
    sep="\n",
)

Creative Title: Whiskers on the Move
Subject: Cat
Colors: Gray, Brown
Setting: City Street
