In [3]:
import base64

def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is opened in binary mode, read in full, base64-encoded, and the
    resulting bytes are decoded as UTF-8 text.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

# Example usage: encode the sample thumbnail, resolved relative to the
# current working directory.
# NOTE(review): `base64_image` is not referenced by any later cell visible
# here; the tools below re-encode the image themselves.
image_path = "thumbnail.jpg"
base64_image = encode_image_to_base64(image_path)

In [4]:
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

@tool
def get_negative_points(image_path):
    """Useful for identifying negative points in an image that could cause it to receive a bad CTR."""
    # The docstring above doubles as the tool description exposed to the
    # agent, so it is intentionally kept as a single verbatim sentence.
    #
    # Parameters: image_path -- path of the image file to analyse.
    # Returns: the `.content` of the vision model's reply.
    import mimetypes  # local import: only needed to label the data URL

    # Label the data URL with the file's real media type (e.g. image/jpeg for
    # a .jpg) instead of always claiming PNG. Fall back to the previous
    # hard-coded image/png when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    llm = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1028)
    image = encode_image_to_base64(image_path)
    return llm.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": "Identify the negative points in this image that could cause it to receive a bad CTR."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image}"},
                    },
                ]
            )
        ]
    ).content

In [5]:
@tool
def get_positive_points(image_path):
    """Useful for identifying positive points in an image that could cause it to receive a good CTR."""
    # The docstring above doubles as the tool description exposed to the
    # agent, so it is intentionally kept as a single verbatim sentence.
    #
    # Parameters: image_path -- path of the image file to analyse.
    # Returns: the `.content` of the vision model's reply.
    import mimetypes  # local import: only needed to label the data URL

    # Label the data URL with the file's real media type (e.g. image/jpeg for
    # a .jpg) instead of always claiming PNG. Fall back to the previous
    # hard-coded image/png when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    llm = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1028)
    image = encode_image_to_base64(image_path)
    return llm.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": "Identify the positive points in this image that could cause it to receive a good CTR."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image}"},
                    },
                ]
            )
        ]
    ).content

In [6]:
# Tools handed to both the agent constructor and the tool executor below.
tools = [get_negative_points, get_positive_points]

In [7]:
from langchain import hub
from langchain.agents import create_openai_functions_agent
from langchain_openai.chat_models import ChatOpenAI

# Pull the agent prompt from the LangChain hub (requires network access).
# Swap in a different prompt here to change how the agent is instructed.
prompt = hub.pull("hwchase17/openai-functions-agent")

# Chat model that drives the agent's decisions. This is separate from the
# vision model instantiated inside each tool.
llm = ChatOpenAI(model="gpt-3.5-turbo-1106", streaming=True)

# Combine model, tools and prompt into the OpenAI Functions agent runnable
# that `run_agent` invokes.
agent_runnable = create_openai_functions_agent(llm, tools, prompt)

In [8]:
from typing import TypedDict, Annotated, List, Union
from langchain_core.agents import AgentAction, AgentFinish
from langchain_core.messages import BaseMessage
import operator
import base64
class AgentState(TypedDict):
    """State dictionary shared by the nodes of the agent graph."""
    # Text of the user's request.
    input: str
    # Path of the image to analyse; `execute_tools` passes it as the tool input.
    image_path: str
    # Prior conversation messages (the run below starts with an empty list).
    chat_history: list[BaseMessage]
    # Latest agent decision: written by `run_agent`, read by
    # `should_continue` and `execute_tools`.
    agent_outcome: Union[AgentAction, AgentFinish, None]
    # (action, tool output) pairs. Annotated with `operator.add`, which
    # LangGraph is expected to use as the reducer so each node's returned
    # list is concatenated onto the existing one -- TODO confirm against the
    # installed langgraph version.
    intermediate_steps: Annotated[list[tuple[AgentAction, str]], operator.add]

In [9]:
from langgraph.prebuilt.tool_executor import ToolExecutor

# Executor that `execute_tools` uses to run the agent's chosen action
# against the registered tools.
tool_executor = ToolExecutor(tools)

def run_agent(data):
    """Invoke the agent on the current state and return its decision.

    The result is returned under the ``agent_outcome`` key as a state update.
    """
    return {"agent_outcome": agent_runnable.invoke(data)}

def execute_tools(data):
    """Run the agent's selected tool and record the step.

    The tool input proposed by the agent is overwritten with the state's
    ``image_path`` before the tool executor is invoked. The result is returned
    as a one-element ``intermediate_steps`` list.
    """
    action = data["agent_outcome"]
    # Force the tool to receive the image path from the state, whatever the
    # agent originally supplied as input.
    action.tool_input = data["image_path"]
    return {"intermediate_steps": [(action, tool_executor.invoke(action))]}

def should_continue(data):
    """Choose the next edge: ``"end"`` once the agent has finished, else ``"continue"``."""
    finished = isinstance(data["agent_outcome"], AgentFinish)
    return "end" if finished else "continue"

In [10]:
from langgraph.graph import END, StateGraph

# Build the agent loop as a state graph over `AgentState`.
workflow = StateGraph(AgentState)

# Two nodes: "agent" decides what to do, "action" runs the chosen tool.
workflow.add_node("agent", run_agent)
workflow.add_node("action", execute_tools)

# Every run starts by asking the agent.
workflow.set_entry_point("agent")

# After the agent node, `should_continue` returns "continue" or "end";
# the mapping below routes that to the tool node or to graph termination.
workflow.add_conditional_edges(
    "agent",
    should_continue,
    {
        "continue": "action",
        "end": END,
    },
)

# After a tool runs, hand control back to the agent.
workflow.add_edge("action", "agent")
# Compile the graph into the runnable used by the cells below.
app = workflow.compile()

In [26]:
# Initial graph state: the user's question, the image to analyse, and an
# empty chat history.
inputs = {"input": "Identify the positive and negative points in this image that may be important in predicting thumbnail CTR.", "image_path": "./thumbnail.jpg", "chat_history": []}
# Print each streamed update followed by a separator.
# NOTE(review): the recorded output of this cell is
# "TypeError: 'async_generator' object is not iterable", so this loop failed
# in the saved run. Confirm the streaming API of the installed langgraph
# version (an `async for` over the stream may be required) or remove the
# cell; the later `app.invoke(inputs)` cell succeeds.
for s in app.stream(inputs):
    print(list(s.values())[0])
    print("----")

TypeError: 'async_generator' object is not iterable

In [32]:
# Run the compiled graph to completion and keep the resulting state.
result = app.invoke(inputs)

In [33]:
# Show the agent's final answer, stored under the 'output' return value.
print(result['agent_outcome'].return_values['output']) 

In the provided image, the negative points that may impact the thumbnail CTR include:

1. Violent content featuring a firearm being discharged and violent imagery.
2. Brand association with a well-known brand like Coca-Cola in conjunction with a violent act.
3. Lack of context, potentially leading to confusion for viewers.
4. Audience sensitivity to content featuring weapons and violence.
5. Potential violation of platform policies regarding the depiction of weapons and violence.

On the positive side, the image contains elements that could lead to a high CTR:

1. High-impact visuals with a dynamic composition and bright colors.
2. Emotional expression and intrigue through the person's expressive face.
3. Large, easily readable text with strong color contrast.
4. Brand recognition and potential for evoking curiosity or emotional response.
5. Elements of mystery or suspense that may prompt viewers to click through.

It's important to consider the balance between these positive and negat