# How to pass multimodal data directly to models

source
- https://python.langchain.com/v0.2/docs/how_to/multimodal_inputs/

In [1]:
# Link to the image that the model will be asked to describe.
image_url = (
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/"
    "Gfp-wisconsin-madison-the-nature-boardwalk.jpg/"
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
)

In [2]:
pip install langchain langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [3]:
import getpass
import os

# Ask for each secret only when it is not already present in the environment,
# and label every prompt so it is clear which key is being requested.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API key: ")

# Tracing configuration read from the environment by the LangChain libraries.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "How to pass multimodal data directly to models"

In [4]:
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

# Chat model that every later cell sends its messages to.
model = ChatOpenAI(
    model="gpt-4o",
)

In [5]:
import base64

import httpx


# A common way to pass an image to a model is as an encoded byte string.
# Fail loudly on a non-success HTTP status: otherwise the body of an error
# page would be base64-encoded and later sent to the model as if it were the image.
image_response = httpx.get(image_url)
image_response.raise_for_status()
image_data = base64.b64encode(image_response.content).decode("utf-8")

In [6]:
# Ask about the weather, supplying the downloaded image inline as a base64 data URL.
weather_question = {"type": "text", "text": "describe the weather in this image"}
inline_image = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
}
message = HumanMessage(content=[weather_question, inline_image])

response = model.invoke([message])
print(response.content)

The weather in the image appears to be clear and pleasant. The sky is mostly blue with some scattered clouds, indicating minimal cloud cover and likely sunny conditions. There is no sign of precipitation, and the lighting suggests that it is daytime, possibly late morning or afternoon. The surrounding greenery looks vibrant and well-lit, further emphasizing the sunny and clear weather.


In [7]:
# Reference the same remote URL twice so the model compares the image with itself;
# here the image is passed by URL instead of being embedded as base64.
comparison_question = {"type": "text", "text": "are these two images the same?"}
image_parts = [
    {"type": "image_url", "image_url": {"url": image_url}} for _copy in range(2)
]
message = HumanMessage(content=[comparison_question, *image_parts])

response = model.invoke([message])
print(response.content)

Yes, these two images are the same. They both depict a wooden pathway through a grassy field with a blue sky and clouds overhead.


# Tool calls

In [8]:
from typing import Literal

from langchain_core.tools import tool

# NOTE(review): the docstring is kept verbatim because `@tool` presumably uses it
# as the tool description shown to the model — confirm before rewording it.
@tool
def weather_tool(weather: Literal["sunny", "cloudy", "rainy"]) -> None:
    """Describe the weather"""
    # Deliberately a no-op: this cell only prints the arguments the model picks.
    return None

# Bind the tool to the model, then send the same weather question with the image by URL.
model_with_tools = model.bind_tools([weather_tool])

weather_question = {"type": "text", "text": "describe the weather in this image"}
image_by_url = {"type": "image_url", "image_url": {"url": image_url}}
message = HumanMessage(content=[weather_question, image_by_url])

response = model_with_tools.invoke([message])
# Print the tool calls recorded on the response rather than its text content.
print(response.tool_calls)

[{'name': 'weather_tool', 'args': {'weather': 'sunny'}, 'id': 'call_MMbtVHX0jDlCzRSTNvQd1AOc', 'type': 'tool_call'}]
