In [3]:
from typing import Annotated, List, Tuple, Union
import os
import base64
from langchain_core.tools import tool

from langchain_openai import ChatOpenAI

@tool
def get_video_description(
    question: str, frames_path: str
    ):
    """Useful for identifying and descripting objects in an sequence of frames. Need a question and the path to the base64 frames."""
    # NOTE(review): the docstring above is intentionally left verbatim. This
    # function is wrapped by `@tool`, and the docstring is presumably what the
    # model is shown as the tool description -- confirm before rewording it.
    #
    # Args:
    #   question:    the question to ask about the frames.
    #   frames_path: directory whose files are the frame images; each file is
    #                read as bytes and base64-encoded here.
    # Returns:
    #   Whatever `llm.invoke` returns for the assembled prompt (unchanged).
    # Raises:
    #   ValueError: if `frames_path` contains no regular files.
    max_frames = 8

    # os.listdir() returns entries in arbitrary order, so sort to get a stable
    # sequence (numeric order additionally requires zero-padded file names).
    # Sub-directories (e.g. a `.ipynb_checkpoints` folder) are skipped because
    # open() would fail on them.
    candidate_paths = (
        os.path.join(frames_path, filename)
        for filename in sorted(os.listdir(frames_path))
    )
    frame_files = [path for path in candidate_paths if os.path.isfile(path)]
    if not frame_files:
        raise ValueError(f"No frame files found in {frames_path!r}")

    # Select at most `max_frames` frames, equally spaced. With fewer frames than
    # that, keep them all (the previous `[:: n // 8]` slice had a step of 0 in
    # that case and could also return more than 8 frames).
    total = len(frame_files)
    if total > max_frames:
        frame_files = [
            frame_files[i * total // max_frames] for i in range(max_frames)
        ]

    # Only the selected frames are read and encoded.
    base64Frames = []
    for path in frame_files:
        with open(path, "rb") as image_file:
            base64Frames.append(base64.b64encode(image_file.read()).decode("utf-8"))

    llm = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1028)
    PROMPT_MESSAGES = [
        {
            "role": "system",
            "content": "You are provided with a sequence of frames from a video in base64 format. Your task is to analyze that sequence and answer questions related to them.",
        },
        {
            "role": "user",
            "content": [
                question,
                *({"image": frame, "resize": 768} for frame in base64Frames),
            ],
        },
    ]
    response = llm.invoke(PROMPT_MESSAGES)
    return response

In [2]:
# `get_video_description` is wrapped by `@tool`, so it is run through the tool
# interface with a single dict of arguments instead of being called with
# keyword arguments like a plain function.
print(
    get_video_description.invoke(
        {"question": "Describe this video", "frames_path": "./analyst1_frames"}
    )
)

content="This video shows a person cutting a pizza. The pizza is on a wooden board, and the person is wearing black clothing with a logo on the chest (though the full logo isn't visible) and white gloves. They are using a pizza cutter with a black handle to slice the pizza. The pizza has a golden-brown crust, cheese, and what appears to be red meat toppings, possibly pepperoni or sausage.\n\nThe sequence of frames suggests that the person is cutting the pizza into slices. The cutter is first positioned at the edge of the pizza, then pressed down through the pizza to cut it, and then lifted away after completing the cut. The person's hand placement and the movement of the pizza cutter indicate the action of slicing the pizza. The final frame shows the pizza with visible cut lines, indicating the slicing has been completed or is in progress."


In [4]:
# Tools made available to the agents; this list is passed to create_agent()
# for both agents and to ToolExecutor further down.
tools = [get_video_description]

In [5]:
import json

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    ChatMessage,
    FunctionMessage,
    HumanMessage,
)
from langchain.tools.render import format_tool_to_openai_function
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langgraph.graph import END, StateGraph
from langgraph.prebuilt.tool_executor import ToolExecutor, ToolInvocation


def create_agent(llm, tools, system_message: str):
    """Build an agent as a chat prompt piped into a function-calling LLM.

    Args:
        llm: chat model; must provide `bind_functions`.
        tools: tools the agent may call. Each is converted with
            `format_tool_to_openai_function`, and its `.name` is listed in the
            system prompt.
        system_message: agent-specific instructions appended to the shared
            system prompt.

    Returns:
        The composition `prompt | llm.bind_functions(...)`. The prompt still
        expects a `messages` variable when invoked.
    """
    system_template = (
        "You are a helpful AI assistant that answers questions about a video, collaborating with other assistants."
        " Use the provided tools to see the video and progress towards answering the question."
        " If you are unable to fully answer, that's OK, another assistant with different knowledge of the video"
        " will help where you left off. Execute what you can to make progress."
        " If you or any of the other assistants have the final answer or deliverable,"
        " prefix your response with FINAL ANSWER so the team knows to stop."
        " You have access to the following tools: {tool_names}.\n{system_message}"
    )
    openai_functions = list(map(format_tool_to_openai_function, tools))

    chat_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_template),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )
    # Pre-fill everything except the running conversation.
    chat_prompt = chat_prompt.partial(
        system_message=system_message,
        tool_names=", ".join(t.name for t in tools),
    )
    return chat_prompt | llm.bind_functions(openai_functions)

In [6]:
import operator
from typing import Annotated, List, Sequence, Tuple, TypedDict, Union

from langchain.agents import create_openai_functions_agent
from langchain.tools.render import format_tool_to_openai_function
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from langchain_openai import ChatOpenAI
from typing_extensions import TypedDict


# State object passed between the nodes of the graph. Separate nodes are
# created for each agent and for tool execution.
class AgentState(TypedDict):
    # Conversation so far. The operator.add annotation is presumably used by
    # LangGraph as the reducer, so the one-element lists returned by the nodes
    # get appended rather than replacing the history -- TODO confirm.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # Frames directory that tool_node injects into tool calls made by "agent_1".
    path_frames_1: str
    # Frames directory that tool_node injects into tool calls made by "agent_2".
    path_frames_2: str
    # Name of the agent node that produced the latest update; the graph reads
    # it after a tool call to route back to the calling agent.
    sender: str

In [7]:
import functools


# Helper function to create a node for a given agent
def agent_node(state, agent, name):
    """Run `agent` on the current state and package its reply as a state update.

    Args:
        state: current graph state, passed straight to `agent.invoke`.
        agent: runnable to invoke.
        name: node name; attached to the reply and recorded as the sender.

    Returns:
        A dict with the reply as a one-element `messages` list and `sender`
        set to `name`.
    """
    reply = agent.invoke(state)
    if not isinstance(reply, FunctionMessage):
        # Re-wrap any other reply as a HumanMessage tagged with this agent's
        # name, dropping the original "type" and "name" fields.
        fields = reply.dict(exclude={"type", "name"})
        reply = HumanMessage(**fields, name=name)
    # The workflow is strict, so recording the sender is enough to know
    # where to route next.
    return {"messages": [reply], "sender": name}


# Chat model shared by both agents (passed to create_agent for agent_1 and agent_2).
llm = ChatOpenAI(model="gpt-4-1106-preview")


In [53]:
# Both agents share the same LLM and tool list; only the system message
# (which half of the movie they cover) differs.
agent_1 = create_agent(
    llm=llm,
    tools=tools,
    system_message="You have access to the first part of the movie. Wait for the second agent to provide the second part.",
)
agent_2 = create_agent(
    llm=llm,
    tools=tools,
    system_message="You have access to the second part of the movie. Wait for the first agent to provide the first part.",
)

# Bind each agent and its node name into agent_node so that the resulting
# callables only need the state argument.
agent_1_node = functools.partial(agent_node, name="agent_1", agent=agent_1)
agent_2_node = functools.partial(agent_node, name="agent_2", agent=agent_2)

In [54]:
# Executor that tool_node uses to run a ToolInvocation against the tools list.
tool_executor = ToolExecutor(tools)

In [55]:
def tool_node(state):
    """Execute the function call requested by the last message.

    Reads the `function_call` entry from the last message's
    `additional_kwargs`, overrides the `frames_path` argument with the frames
    directory stored in the state for the calling agent, runs the tool through
    `tool_executor`, and wraps the result in a FunctionMessage.

    Args:
        state: graph state; must contain `messages`, plus `path_frames_1` /
            `path_frames_2` when the caller is "agent_1" / "agent_2".

    Returns:
        A dict with a one-element `messages` list holding the FunctionMessage.
    """
    last_message = state["messages"][-1]
    function_call = last_message.additional_kwargs["function_call"]
    tool_name = function_call["name"]
    tool_input = json.loads(function_call["arguments"])

    # A lone "__arg1" key is unwrapped to its bare value (presumably the
    # single-positional-argument convention -- confirm against the tool schema).
    if len(tool_input) == 1 and "__arg1" in tool_input:
        tool_input = next(iter(tool_input.values()))

    # The frames directory comes from the state, keyed by the calling agent,
    # so whatever path the model supplied is overridden. After the unwrap
    # above the input may no longer be a dict; item assignment would raise
    # TypeError on e.g. a str, so it is only done for dict inputs.
    frames_key = {
        "agent_1": "path_frames_1",
        "agent_2": "path_frames_2",
    }.get(last_message.name)
    if frames_key is not None and isinstance(tool_input, dict):
        tool_input["frames_path"] = state[frames_key]

    action = ToolInvocation(
        tool=tool_name,
        tool_input=tool_input,
    )
    response = tool_executor.invoke(action)
    function_message = FunctionMessage(
        content=f"{tool_name} response: {str(response)}", name=action.tool
    )
    return {"messages": [function_message]}

In [56]:
def router(state):
    """Pick the next edge label from the most recent message.

    Returns:
        "call_tool" if the last message carries a `function_call` entry in its
        `additional_kwargs` (checked first), "end" if its content contains
        "FINAL ANSWER", otherwise "continue".
    """
    latest = state["messages"][-1]
    # A pending tool request takes priority over everything else.
    if "function_call" in latest.additional_kwargs:
        return "call_tool"
    # Any agent may declare the work done by emitting the marker text.
    return "end" if "FINAL ANSWER" in latest.content else "continue"

In [57]:
# Build the two-agent graph over the shared AgentState.
workflow = StateGraph(AgentState)

# Nodes: one per agent plus a single shared tool-execution node.
workflow.add_node("agent_1", agent_1_node)
workflow.add_node("agent_2", agent_2_node)
workflow.add_node("call_tool", tool_node)

# After agent_1 runs, router() picks the label: "continue" hands over to
# agent_2, "call_tool" runs the requested tool, "end" stops the graph.
workflow.add_conditional_edges(
    "agent_1",
    router,
    {"continue": "agent_2", "call_tool": "call_tool", "end": END},
)
# Same routing for agent_2, except that "continue" hands back to agent_1.
workflow.add_conditional_edges(
    "agent_2",
    router,
    {"continue": "agent_1", "call_tool": "call_tool", "end": END},
)
# After a tool call, go back to the agent recorded in state["sender"]
# (tool_node does not change "sender", so this is the agent that asked).
workflow.add_conditional_edges(
    "call_tool",
    lambda x: x["sender"],
    {"agent_1": "agent_1", "agent_2": "agent_2"},
)

# agent_1 always speaks first.
workflow.set_entry_point("agent_1")
graph = workflow.compile()

In [58]:
# Run the graph and print each streamed item as it arrives. The initial state
# carries the user question and one frames directory per agent; tool_node
# reads those paths when an agent calls the tool.
for s in graph.stream(
    {
        "messages": [
            HumanMessage(
                content="Describe the video in detail."
            )
        ],
        "path_frames_1": "./analyst1_frames",
        "path_frames_2": "./analyst2_frames",
    },
    # Maximum number of steps to take in the graph
    {"recursion_limit": 150},
):
    print(s)
    print("----")


{'agent_1': {'messages': [HumanMessage(content='', additional_kwargs={'function_call': {'arguments': '{"question":"Describe the video in detail.","frames_path":"frames1"}', 'name': 'get_video_description'}}, name='agent_1')], 'sender': 'agent_1'}}
----
{'call_tool': {'messages': [FunctionMessage(content='get_video_description response: content="The sequence of images shows a person cutting a pizza into slices. The person is wearing a black t-shirt and white gloves, indicating a focus on hygiene. The pizza is on a wooden board on a table, and the person uses a pizza cutter with a black handle to slice the pizza.\\n\\nHere\'s a more detailed breakdown:\\n\\n1. In the first image, the person has just started to cut into the pizza, with their hands positioned to make a slice.\\n2. The second image captures the pizza cutter in motion, midway through slicing.\\n3. The third image shows the person completing the cut, with one hand holding the pizza in place.\\n4. In the fourth image, the pizz

In [61]:
# Hard-coded final answer text (presumably copied from the streamed output of a
# previous run -- it is not computed here), printed so the "\n" escapes render
# as real line breaks.
content="FINAL ANSWER\n\nThe video comprises two parts, detailing the process of preparing and garnishing a pizza. In the first part, an individual is shown cutting a pizza into slices. The person is wearing a black t-shirt and white gloves, highlighting hygiene. A pizza cutter with a black handle is used to slice the pizza placed on a wooden board on a table. The sequence suggests the methodical cutting of the pizza into equal slices, potentially as part of a looped video or demonstrating repetitive actions during the slicing process.\n\nIn the second part, the same person, now donning a black apron along with the white gloves, is garnishing a cooked pizza with melted cheese and toppings. The pizza is on the same wooden cutting board, with the pizza cutter positioned to the left. Green toppings, which could be herbs or green onions, are being evenly distributed by hand over the pizza. The focus remains on the pizza and the garnishing activity, as the person's face is not shown. The kitchen setting is simple and well-lit by natural light, likely from the right side of the frame.\n\nOverall, the video portrays the final steps of pizza preparation, from cutting to garnishing, in a clean and methodical fashion before the pizza is served."
print(content)

FINAL ANSWER

The video comprises two parts, detailing the process of preparing and garnishing a pizza. In the first part, an individual is shown cutting a pizza into slices. The person is wearing a black t-shirt and white gloves, highlighting hygiene. A pizza cutter with a black handle is used to slice the pizza placed on a wooden board on a table. The sequence suggests the methodical cutting of the pizza into equal slices, potentially as part of a looped video or demonstrating repetitive actions during the slicing process.

In the second part, the same person, now donning a black apron along with the white gloves, is garnishing a cooked pizza with melted cheese and toppings. The pizza is on the same wooden cutting board, with the pizza cutter positioned to the left. Green toppings, which could be herbs or green onions, are being evenly distributed by hand over the pizza. The focus remains on the pizza and the garnishing activity, as the person's face is not shown. The kitchen setting