In [1]:
from typing import Annotated, List, Tuple, Union
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
import threading
import base64
import os
import cv2

@tool
def get_video_description(
    question: str, video_path: str
    ):
    """Useful for descripting scenes and answer questions from a video. Need a question and the path to the video."""

    vidcap = cv2.VideoCapture(video_path)
    success,image = vidcap.read()
    frames = []
    while success:
        frames.append(image)
        success,image = vidcap.read()
    
    # Selecting 8 frames from the video
    selected_frames = []
    for i in range(0, len(frames), len(frames)//8):
        selected_frames.append(frames[i])
    # Converting the frames to base64
    base64Frames = []
    for frame in selected_frames:
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    llm = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1028)
    PROMPT_MESSAGES = [
        {
            "role": "system",
            "content": "You are provided with a sequence of frames from a video in base64 format. Your task is to describe that sequence and answer questions related to them.",
        },
        {
            "role": "user",
            "content": [
                question,
                *map(lambda x: {"image": x, "resize": 768}, base64Frames),
            ],
        },
    ]
    response = llm.invoke(PROMPT_MESSAGES)
    return response

In [2]:
from moviepy.editor import VideoFileClip
import datetime

@tool
def split_video(video_path: str):
    """Split a video into 2 equals parts, save them as mp4 and return the names of the files."""
    
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # Load the video
    video = VideoFileClip(video_path)

    # Calculate the duration of the video
    duration = video.duration

    # Split the duration to get the midpoint
    midpoint = duration / 2

    # Split the video into two parts
    video.subclip(0, midpoint).write_videofile(f"part1_{timestamp}.mp4", verbose=False, logger=None)
    video.subclip(midpoint, duration).write_videofile(f"part2_{timestamp}.mp4", verbose=False, logger=None)
    return [f"part1_{timestamp}.mp4", f"part2_{timestamp}.mp4"]


In [3]:
from langchain import hub
from langchain.agents import create_openai_functions_agent, AgentExecutor
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.messages import BaseMessage, HumanMessage, FunctionMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

system_prompt=(
    "Your task is to analyze a video and answer questions from the user. First you must divide it into two equal parts and then using a descriptive tool to analyze each part. Once both parts are analyzed, you need to aggregate the findings from both analyses into a comprehensive summary."
    "Step 1: Divide the Video"
    "You are provided with a video file path."
    "Use the split_video tool to divide this video into two equal parts. The tool is invoked with the command: split_video('example_video.mp4')."
    "The split_video tool will return the path of two files, representing the first and second halves of the video, respectively."
    "Step 2: Analyze Each Part"
    "Once the video is divided, use the get_video_description tool to analyze each part."
    "Invoke the tool for the first part using the command: get_video_description('path_file.mp4')."
    "Repeat the process for the second part with the command: get_video_description('path_file.mp4')."
    "Each call to get_video_description will return a description of the respective video part."
    "Step 3: Aggregate Findings"
    "After receiving the descriptions for both parts, your next task is to aggregate these findings."
    "Combine the key points from each description to form a comprehensive summary and answer the user questions."
    "Focus on highlighting any significant observations, thematic elements, or notable content from each part of the video."
    "Step 4: Final Submission"
    "Compile your aggregated summary into a final report."
    "Ensure that the report is clear and concise, providing a complete understanding of the entire video's content based on your analysis."
    "When the report is ready, submit it with the title: “Comprehensive Analysis of example_video.mp4”."
    "Remember: Accuracy and thoroughness are key in this task. Ensure that you capture all relevant details in your analysis and report."
    "When finished, respond with FINISH."
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_prompt,
        ),
        MessagesPlaceholder(variable_name="messages"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# Choose the LLM that will drive the agent
llm = ChatOpenAI(model="gpt-3.5-turbo-1106", streaming=True)

# Create the tools object
tools = [get_video_description, split_video]

# Construct the OpenAI Functions agent
agent_1 = create_openai_functions_agent(llm, tools, prompt)
agent_1 = AgentExecutor(agent=agent_1, tools=tools)
agent_2 = create_openai_functions_agent(llm, tools, prompt)
agent_2 = AgentExecutor(agent=agent_2, tools=tools)

In [4]:
members = ["Agent_1", "Agent_2"]
system_prompt=(
    "Your task is to oversee the analysis of a video by working with your team members: {members}. The process requires you to first split the video into two parts and then sequentially instruct two different agents to analyze these parts. Follow these steps carefully:"
    "Step 1: Split the Video"
    "Begin by using the split_video tool to divide the video into two equal segments. Execute this with the command: split_video('file_path')."
    "This will result in the creation of two new video files, representing the first and second halves of the original video."
    "Step 2: Instruct Agent_1 and Wait"
    "After splitting the video, send a message to the first agent (Agent_1) containing only the file path of the first video segment and the user's specific question. Format the message as follows: 'Please analyze the video segment. The video file is located at file_path. Your analysis is crucial. - Agent_1'. It's imperative to include nothing else in this message."
    "Once this message is sent, suspend your actions and await further instructions from the router for the next step."
    "Step 3: Instruct Agent_2 and Wait"
    "When prompted again by the router, send a similar message to the second agent (Agent_2) with the file path of the second video segment and the user's question. The message should be formatted as: 'Please analyze the video segment. The video file is located at file_path. Your analysis is crucial. - Agent_2'. Ensure the message contains only this specific information."
    "After dispatching this message, pause your operations and wait for the next directive from the router."
    "Step 4: Compile Findings"
    "Upon reactivation by the router, and once both agents have finished their analysis, compile and synthesize the findings from each video segment."
    "Create a detailed summary that includes essential insights, themes, and significant aspects from both parts of the video."
    "Step 5: Final Report"
    "Assemble these findings into a final report. The report should be concise, clear, and offer an in-depth overview of the video based on the analyses."
    "Name the report 'Comprehensive Analysis of example_video.mp4', ensuring it contains all relevant details from the analyses for accuracy."
    "Conclude your task by replying with FINAL ANSWER."
)

options = members + ["FINAL ANSWER"]

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_prompt,
        ),
        MessagesPlaceholder(variable_name="messages"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
).partial(options=str(options), members=", ".join(members))

supervisor = create_openai_functions_agent(llm, [split_video], prompt)
supervisor = AgentExecutor(agent=supervisor, tools=[split_video])

In [5]:
from typing import TypedDict, Annotated, List, Union, Sequence
from langchain_core.agents import AgentAction, AgentFinish
from langchain_core.messages import BaseMessage
import operator

class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], operator.add]
    sender: str

In [6]:
import functools


def agent_node(state, agent, name):
    result = agent.invoke(state)
    return {"messages": [HumanMessage(content=result["output"], name=name)]}

agent_1_node = functools.partial(agent_node, agent=agent_1, name="Agent_1")
agent_2_node = functools.partial(agent_node, agent=agent_2, name="Agent_2")
supervisor_node = functools.partial(agent_node, agent=supervisor, name="supervisor")

In [7]:
from langgraph.prebuilt.tool_executor import ToolExecutor, ToolInvocation
import json

tool_executor = ToolExecutor(tools)

def tool_node(state):
    """This runs tools in the graph

    It takes in an agent action and calls that tool and returns the result."""
    messages = state["messages"]
    # Based on the continue condition
    # we know the last message involves a function call
    last_message = messages[-1]
    # We construct an ToolInvocation from the function_call
    tool_input = json.loads(
        last_message.additional_kwargs["function_call"]["arguments"]
    )
    # We can pass single-arg inputs by value
    if len(tool_input) == 1 and "__arg1" in tool_input:
        tool_input = next(iter(tool_input.values()))
    tool_name = last_message.additional_kwargs["function_call"]["name"]
    action = ToolInvocation(
        tool=tool_name,
        tool_input=tool_input,
    )
    # We call the tool_executor and get back a response
    response = tool_executor.invoke(action)
    # We use the response to create a FunctionMessage
    function_message = FunctionMessage(
        content=f"{tool_name} response: {str(response)}", name=action.tool
    )
    # print(f"State: {state}")
    # We return a list, because this will get added to the existing list
    return {"messages": [function_message]}

In [11]:
def router(state):
    # This is the router
    messages = state["messages"]
    last_message = messages[-1]
    print(f"Last message: {last_message}")
    last_sender = last_message.name
    print(f"Last sender: {last_sender}")
    if "function_call" in last_message.additional_kwargs:
        return "call_tool"
    if last_sender == "Agent_1":
        return "finish"
    if last_sender == "Agent_2":
        return "finish"
    if "FINAL ANSWER" in last_message.content and last_sender == "supervisor":
        return "end"
    if "part1" in last_message.content and last_sender == "supervisor":
        return "agent_1"
    if "part2" in last_message.content and last_sender == "supervisor":
        return "agent_2"
    return "continue"

In [12]:
from langgraph.graph import END, StateGraph
from langgraph.checkpoint.sqlite import SqliteSaver

workflow = StateGraph(AgentState)

workflow.add_node("Agent_1", agent_1_node)
workflow.add_node("Agent_2", agent_2_node)
workflow.add_node("supervisor", supervisor_node)
workflow.add_node("call_tool", tool_node)


workflow.add_conditional_edges(
    "Agent_1",
    router,
    {"continue": "Agent_1", "call_tool": "call_tool", "finish": "supervisor"},
)
workflow.add_conditional_edges(
    "Agent_2",
    router,
    {"continue": "Agent_2", "call_tool": "call_tool", "finish": "supervisor"},
)
workflow.add_conditional_edges(
    "supervisor",
    router,
    {"continue": "supervisor", "end": END, "call_tool": "call_tool", "agent_1": "Agent_1", "agent_2": "Agent_2"},
)
workflow.add_conditional_edges(
    "call_tool",
    # Each agent node updates the 'sender' field
    # the tool calling node does not, meaning
    # this edge will route back to the original agent
    # who invoked the tool
    lambda x: x["sender"],
    {
        "Agent_1": "Agent_1",
        "Agent_2": "Agent_2",
        "supervisor": "supervisor",
    },
)
workflow.set_entry_point("supervisor")

memory = SqliteSaver.from_conn_string(":memory:")
graph = workflow.compile(checkpointer=memory)

In [13]:
for s in graph.stream(
    {
        "messages": [
            HumanMessage(content="Write a brief report for the video. Its path is hospital.mp4.", name="user")
        ],
    },
    {"configurable":{"thread_id": "2"}},
):
    if "__end__" not in s:
        print(s)
        print("----")

{'supervisor': {'messages': [HumanMessage(content='Please analyze the video segment. The video file is located at part1_20240216_195605.mp4. Your analysis is crucial. - Agent_1', name='supervisor')]}}
----
Last message: content='Please analyze the video segment. The video file is located at part1_20240216_195605.mp4. Your analysis is crucial. - Agent_1' name='supervisor'
Last sender: supervisor
{'Agent_1': {'messages': [HumanMessage(content='The video "hospital.mp4" has been split into two parts for analysis.\n\nFirst Part Analysis:\nThe first part of the video depicts a vintage sequence likely showing a scientific or medical procedure. It features a woman undergoing respiratory or speech testing, followed by her use of a "ARMY EMERGENCY RESUSCITATOR," indicating a military context. The video also shows a storage area for chemicals or medical supplies. Overall, it appears to be related to medical or emergency preparedness, possibly demonstrating the use of equipment designed for resusc

In [None]:
graph.invoke(
    {
        "messages": [
            HumanMessage(content="Write a brief report for the video. Its path is hospital.mp4.", name="user")
        ],
    },
    {"recursion_limit": 100},
)