# Structured output with Multimodal Agents in LangGraph

## Install Libraries

In [None]:
!pip install langchain langchain-groq langgraph langchain-anthropic

## Set up environment variables

In [None]:
import os
import getpass

# Optional: Uncomment and set these environment variables to use LangSmith
#os.environ['LANGCHAIN_TRACING_V2'] = 'true'
#os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
#os.environ['LANGCHAIN_API_KEY'] = getpass.getpass('Enter your Langchain API key: ')
#os.environ['LANGCHAIN_PROJECT'] = getpass.getpass('Enter your Langchain project ID: ')

# Ask for each provider key only when it is not already present (and non-empty)
# in the environment, so re-running this cell, or starting the notebook with the
# keys already exported, neither prompts again nor overwrites an existing value.
for _env_name, _prompt in (
    ('GROQ_API_KEY', 'Enter your Groq API key: '),
    ('ANTHROPIC_API_KEY', 'Enter your Anthropic API key: '),
):
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass.getpass(_prompt)

## Set up the LLMs

In [None]:
from langchain_groq import ChatGroq
from langchain_anthropic import ChatAnthropic

# Instantiate the two chat models used by the graph, both with temperature=0.0.
# NOTE: no `streaming=True` argument is passed here, so token streaming is not
# explicitly enabled for either model.
# llama_70b (Groq) is used by generate_poem; claude_haiku (Anthropic) is used by extract_data.
llama_70b = ChatGroq(model="llama3-groq-70b-8192-tool-use-preview", temperature=0.0)
claude_haiku = ChatAnthropic(model="claude-3-haiku-20240307", temperature=0.0)

## Define Objects

In [None]:
from typing import Sequence, TypedDict
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import HumanMessage, SystemMessage

# Schema of the structured output requested from the vision model in
# extract_data (it is passed to `with_structured_output`). Every field is a
# sequence of strings; the extraction prompt asks for an empty list when a
# category is not present in the image.
# NOTE(review): documented with comments rather than a class docstring because
# a docstring may end up in the schema description sent to the model — confirm
# before converting.
class ImageDataPoints(BaseModel):
    # Colors seen in the image.
    colors: Sequence[str] = Field(description="The colors present in the image")
    # Pieces of text seen in the image.
    text: Sequence[str] = Field(description="The text present in the image")
    # Objects seen in the image.
    objects: Sequence[str] = Field(description="The objects present in the image")
    # Overall mood/"vibes" of the image.
    vibes: Sequence[str] = Field(description="The vibes present in the image")

# Schema of the structured output requested from the text model in
# generate_poem (it is passed to `with_structured_output`): the model's
# reasoning plus its final answer, which generate_poem stores as the poem.
# NOTE(review): documented with comments rather than a class docstring because
# a docstring may end up in the schema description sent to the model — confirm
# before converting.
class AgentReasoning(BaseModel):
    # Free-text reasoning; copied into state['image_poem_reasoning'].
    reasoning: str = Field(description="The reasoning behind the final answer")
    # The answer itself; copied into state['image_poem'].
    final_answer: str = Field(description="The final answer of the agent")

class AgentState(TypedDict):
    """State dictionary passed between the nodes of the graph.

    The caller supplies ``image_path``; the remaining keys are written by the
    ``extract_data`` and ``generate_poem`` nodes.
    """

    # Path of the image file to analyse.
    image_path: str
    # Structured extraction result for the image. This is a single
    # ImageDataPoints instance, not a sequence of them: extract_data stores the
    # model's result here directly and generate_poem reads its attributes
    # (.colors, .text, .objects, .vibes).
    image_data: ImageDataPoints
    # The poem produced by generate_poem.
    image_poem: str
    # The reasoning the model returned alongside the poem.
    image_poem_reasoning: str


## Create Base64 helper function

In [None]:
import base64
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 text string.

    The file is read in binary mode and the base64 bytes are decoded as UTF-8,
    so the result is a plain ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

## Construct the Agent Graph

In [None]:
from langgraph.graph import END, StateGraph, START

# Create the graph builder; AgentState is the state type its node functions receive
workflow = StateGraph(AgentState)

# Define the image data-extraction node
def extract_data(state: AgentState):
    """Extract structured data points from the image referenced by the state.

    Reads ``state['image_path']``, sends the base64-encoded image together with
    an extraction prompt to ``claude_haiku`` (configured to return an
    ``ImageDataPoints`` object), and stores the result in
    ``state['image_data']``.

    Args:
        state: Graph state; ``image_path`` must point to a readable image file.

    Returns:
        The same state dictionary, updated in place with ``image_data``.
    """
    import mimetypes  # local import: only this node needs it

    image_path = state['image_path']
    image_data = encode_image(image_path)

    # Declare the media type that matches the file instead of always claiming
    # JPEG, so that e.g. PNG files are labelled correctly. Anything that is not
    # one of the formats below keeps the previous "image/jpeg" default.
    supported_media_types = {"image/jpeg", "image/png", "image/gif", "image/webp"}
    guessed_type, _ = mimetypes.guess_type(image_path)
    media_type = guessed_type if guessed_type in supported_media_types else "image/jpeg"

    messages = [
        SystemMessage("You are an expert algorithm in unstructured data extraction from images."),
        HumanMessage(
            content=[
                {"type": "text", "text": """Please extract the following data points from the image:
                - colors: the colors present in the image
                - text: the text present in the image
                - objects: the objects present in the image
                - vibes: the vibes present in the image

                 If any of them are not present, please fill them with an empty list.
                """},
                {"type": "image", "source" : {"type": "base64", "media_type": media_type, "data": image_data}}
            ]
        )
    ]

    # Have the model answer with an ImageDataPoints instance rather than free text.
    structured_claude = claude_haiku.with_structured_output(ImageDataPoints)

    result = structured_claude.invoke(messages)

    state['image_data'] = result

    return state

def generate_poem(state: AgentState):
    """Ask the text model for a haiku built from the extracted image data.

    Reads the colors, text, objects and vibes from ``state['image_data']``,
    embeds them in the prompt sent to ``llama_70b`` (configured to return an
    ``AgentReasoning`` object), then writes the answer to
    ``state['image_poem']`` and the reasoning to
    ``state['image_poem_reasoning']``.

    Returns:
        The same state dictionary, updated in place.
    """
    extracted = state['image_data']

    # The whitespace inside this literal is part of the prompt; keep it as is.
    prompt = f"""
                Please generate a haiku poem based on the extracted data points from the image.
                The extracted data points are:
                    - Colors: {extracted.colors}
                    - Text: {extracted.text}
                    - Objects: {extracted.objects}
                    - Vibes: {extracted.vibes}

                Reason about the extracted data points first, take your time. Once you have a clear understanding, generate an elegant, well crafted, and meaningful haiku poem.
                """

    conversation = [
        SystemMessage("You are an expert in generating poems."),
        HumanMessage(content=[{"type": "text", "text": prompt}]),
    ]

    # The model replies with an AgentReasoning instance (reasoning + final answer).
    answer = llama_70b.with_structured_output(AgentReasoning).invoke(conversation)

    state.update(
        image_poem=answer.final_answer,
        image_poem_reasoning=answer.reasoning,
    )
    return state

# Register each node function under its own function name
# ("extract_data" and "generate_poem").
for _node_fn in (extract_data, generate_poem):
    workflow.add_node(_node_fn.__name__, _node_fn)

# Linear flow: START -> extract_data -> generate_poem -> END,
# added one consecutive pair at a time.
_flow = (START, "extract_data", "generate_poem", END)
for _source, _target in zip(_flow, _flow[1:]):
    workflow.add_edge(_source, _target)


# Finally, compile the graph.
# The result is a LangChain Runnable, so it can be used
# like any other runnable (e.g. app.stream(...) below).
app = workflow.compile()

## Show the Graph

In [None]:
from IPython.display import Image, display

# Rendering the graph is optional and requires some extra dependencies, so a
# failure must not stop the notebook. Report why it was skipped instead of
# hiding the error silently.
try:
    display(Image(app.get_graph(xray=True).draw_mermaid_png()))
except Exception as exc:
    print(f"Skipping graph rendering: {exc!r}")

## Test on one image

In [None]:
# Select one sample image from the "images" folder under the current working
# directory and wrap its path in the input dictionary expected by the graph.
import os

image_path = os.path.join(os.getcwd(), "images/18564-36081-51951.jpg")
inputs = dict(image_path=image_path)

In [None]:
# Preview the selected image inline (width=200) using the IPython display helpers
display(Image(filename=inputs["image_path"], width=200))

In [None]:
for output in app.stream(inputs):
    # Each streamed item is a dictionary keyed by node name, holding that node's output
    for key in output:
        value = output[key]
        print(f"Output from node '{key}':", "---", value, sep="\n")
    print("\n---\n")

## Scaling: Test on multiple images

In [None]:
# Run the graph on every image file in the directory
import mimetypes
import os

image_dir = os.path.join(os.getcwd(), "images")

# os.listdir() also returns sub-directories and non-image files (for example
# hidden metadata files) in arbitrary order. Keep only regular files whose name
# maps to an image media type, and sort them so runs are reproducible.
image_files = sorted(
    name
    for name in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, name))
    and (mimetypes.guess_type(name)[0] or "").startswith("image/")
)

# One single-key dict per image: {image_path: [poem, reasoning, image_data]}
results=[]
for image_file in image_files:
    inputs = {"image_path": os.path.join(image_dir, image_file)}

    # Display the image
    display(Image(filename=inputs["image_path"], width=200))
    for output in app.stream(inputs):
        # stream() yields dictionaries with output keyed by node name
        for key, value in output.items():
            print(f"Output from node '{key}':")
            print("---")
            print(value)
            # The generate_poem node returns the full state, so all fields
            # needed for the summary are available on its output.
            if key == "generate_poem":
                results.append({value['image_path']: [value['image_poem'], value['image_poem_reasoning'], value['image_data']]})
        print("\n---\n")

In [None]:
# Print a summary per image. Each entry of `results` is a single-key dict
# mapping the image path to [poem, reasoning, extracted data].
print()
for result in results:
    path, fields = list(result.items())[0]
    header = f"Image: {path}"
    underline = "=" * len(header)  # rule as wide as the header line
    divider = "+" * 50

    print(header)
    print(underline)
    print(f"Poem: {fields[0]}")
    print(divider)
    print(f"Reasoning: {fields[1]}")
    print(divider)
    print(f"Data: {fields[2]}")
    print(underline)
    print()