In [1]:
import io
from PIL import Image
import textgrad as tg
from textgrad.autograd import MultimodalLLMCall
from textgrad.loss import ImageQALoss


import textgrad as tg
from textgrad import get_engine, set_backward_engine

# Model identifier reused by the engine set-up below and by the LiteLLM checks
# in the following cell.
MODEL_NAME = "ollama/Qwen2.5"
experimental_engine_name = f"experimental:{MODEL_NAME}"

# Forward engine, built with caching switched off.
engine = get_engine(experimental_engine_name, cache=False)

# set_backward_engine accepts the same engine string and `cache` option;
# override=True presumably replaces a backward engine registered earlier in
# the kernel session, which keeps this cell re-runnable.
set_backward_engine(experimental_engine_name, override=True, cache=False)

In [None]:
import httpx
import litellm
from textgrad.engine_experimental.litellm import LiteLLMEngine

# Turn on LiteLLM's verbose logging so the requests issued below can be inspected.
litellm._turn_on_debug()

# A single engine instance serves both checks instead of constructing it twice.
litellm_engine = LiteLLMEngine(MODEL_NAME, cache=True)

# Text-only check.
litellm_engine.generate(content="hello, what's 3+4", system_prompt="you are an assistant")

# Download the example image. raise_for_status() makes an HTTP failure stop the
# cell here, so an error page is never handed to the model as image bytes.
image_url = "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg"
image_response = httpx.get(image_url)
image_response.raise_for_status()
image_data = image_response.content

# Multimodal check: raw image bytes followed by a text prompt.
litellm_engine.generate(content=[image_data, "what is this my boy"], system_prompt="you are an assistant")

# The set-up cell at the top of the notebook has already registered a backward
# engine, so replacing it needs override=True (TextGrad refuses a second
# registration otherwise).
tg.set_backward_engine("gpt-4o", override=True)



In [6]:
# Load the raw bytes of the local image; the path is relative to the
# notebook's working directory.
image_path = "bee.jpg"
with open(image_path, mode="rb") as image_file:
    image_data = image_file.read()

# Optional sanity check: show the first ten bytes that were read.
leading_bytes = image_data[:10]
print(leading_bytes)

FileNotFoundError: [Errno 2] No such file or directory: 'bee.jpg'

In [5]:
# Wrap the image bytes in a TextGrad variable. requires_grad=False marks it as
# a fixed input rather than something to be optimised.
image_variable = tg.Variable(
    image_data,
    requires_grad=False,
    role_description="image to answer a question about",
)

In [6]:
# The question is a fixed input as well (requires_grad=False).
question_variable = tg.Variable(
    "What do you see in this image?",
    requires_grad=False,
    role_description="question",
)

# Build the multimodal call, then pass it the image followed by the question.
multimodal_call = MultimodalLLMCall("gpt-4o")
response = multimodal_call([image_variable, question_variable])

# Last expression of the cell: displays the resulting Variable.
response

Variable(value=This image shows a close-up of a honeybee collecting pollen. The bee is perched on a cluster of flowers, and you can see pollen attached to its hind legs. The details of the bee's body, including its wings, eyes, and fuzzy thorax, are clearly visible., role=response from the language model, grads=set())

In [7]:
# Loss that asks "gpt-4o" to critique the answer using the instruction below.
# The instruction explicitly tells the evaluator not to write a replacement answer.
loss_fn = ImageQALoss(
    engine="gpt-4o",
    evaluation_instruction=(
        "Does this seem like a complete and good answer for the image? "
        "Criticize. Do not provide a new answer."
    ),
)

# Evaluate the current response against the image and the question.
loss = loss_fn(image=image_variable, question=question_variable, response=response)

# Last expression of the cell: displays the evaluation Variable.
loss

Variable(value=The answer is mostly complete and accurate. It correctly identifies the main elements of the image: a honeybee collecting pollen, the presence of flowers, and the details of the bee's body. However, it could be improved by mentioning the specific type of flowers or plant if identifiable, and by describing the setting or background to provide more context. Additionally, it could note the color and texture details of the bee and flowers for a more vivid description., role=evaluation of the response from the language model, grads=set())

In [8]:
# `response` is the only parameter handed to the optimizer.
tunable_parameters = [response]
optimizer = tg.TGD(parameters=tunable_parameters)

# Backward pass from the loss, then one optimizer step. The order matters:
# the step is taken only after backward() has run.
loss.backward()
optimizer.step()

# Print the text held by `response` after the step, for comparison with the
# response displayed earlier.
print(response.value)

This image shows a close-up of a honeybee collecting pollen. The bee is perched on a cluster of flowers, possibly from a plant like goldenrod, with pollen visibly attached to its hind legs. The details of the bee's body, including its translucent wings, large compound eyes, and fuzzy thorax, are clearly visible. The bee's distinctive black and yellow stripes add to its vibrant appearance. The background appears to be a natural setting, with earthy tones that complement the scene. This moment captures the bee's vital role in pollination, as it diligently gathers pollen, contributing to the ecosystem. The gentle hum of its wings adds a sense of life and movement to the image.
