In [1]:
from typing import Annotated, List, Tuple, Union
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
import threading
import base64
import os

def _frame_sort_key(filename):
    """Sort key that orders frame files numerically by stem (``2.jpg`` before ``10.jpg``)."""
    stem = os.path.splitext(filename)[0]
    if stem.isdecimal():
        return (0, int(stem), "")
    # Names that are not plain numbers go after the numbered frames, alphabetically.
    return (1, 0, filename)


@tool
def get_video_description(
    question: str, path: str
    ):
    """Useful for identifying and describing objects in a sequence of frames. Needs a question and the path to the frames."""
    # NOTE: the docstring above is kept deliberately short because the @tool
    # decorator uses it as the tool description; implementation notes live here.
    #
    # question: what to ask about the frames.
    # path:     folder containing the frame image files.
    # Returns the model response object from ChatOpenAI.invoke unchanged.
    frame_stride = 50  # only every 50th frame is sent to the model

    # os.listdir returns entries in arbitrary order, so sort explicitly;
    # otherwise the sampled frames would not follow the video's timeline.
    base64Frames = []
    for filename in sorted(os.listdir(path), key=_frame_sort_key):
        frame_path = os.path.join(path, filename)
        if not os.path.isfile(frame_path):
            # Skip sub-directories instead of failing when trying to open them.
            continue
        with open(frame_path, "rb") as image_file:
            base64Frames.append(base64.b64encode(image_file.read()).decode("utf-8"))

    llm = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1028)
    sampled_frames = [
        {"image": frame, "resize": 768} for frame in base64Frames[0::frame_stride]
    ]
    PROMPT_MESSAGES = [
        {
            "role": "system",
            "content": "You are provided with a sequence of frames from a video in base64 format. Your task is to analyze that sequence and answer questions related to them.",
        },
        {
            "role": "user",
            "content": [question, *sampled_frames],
        },
    ]
    response = llm.invoke(PROMPT_MESSAGES)
    return response

In [2]:
import cv2
import os

@tool
def split_video(
    video_path: str
    ):
    """Split a video into 2 sets of frames, save them to analysts' folders and return the names of the folders."""
    # Implementation notes (kept out of the docstring, which @tool uses as the
    # tool description):
    # - Frames are written to disk as they are decoded instead of being
    #   collected in a list, so memory use does not grow with video length.
    # - Numbered frames left over from a previous run are removed first so they
    #   cannot be mistaken for frames of the current video.
    # - The first len//2 frames end up in analyst1_frames, the rest in
    #   analyst2_frames, each numbered from 0.
    import shutil  # local import: the notebook does not import shutil at the top

    # Printed before any work is done (kept for parity with the recorded
    # notebook output); it does not indicate that the split succeeded.
    print("success")

    first_dir, second_dir = "analyst1_frames", "analyst2_frames"
    # create the folders for the analysts
    for folder in (first_dir, second_dir):
        os.makedirs(folder, exist_ok=True)
        for stale in os.listdir(folder):
            stem, ext = os.path.splitext(stale)
            stale_path = os.path.join(folder, stale)
            # Only touch files this function itself produces: "<number>.jpg".
            if ext == ".jpg" and stem.isdecimal() and os.path.isfile(stale_path):
                os.remove(stale_path)

    # Decode once, writing every frame into the first folder for now.
    vidcap = cv2.VideoCapture(video_path)
    total = 0
    try:
        success, image = vidcap.read()
        while success:
            cv2.imwrite(os.path.join(first_dir, f"{total}.jpg"), image)
            total += 1
            success, image = vidcap.read()
    finally:
        # Always release the capture handle, even if writing a frame fails.
        vidcap.release()

    # Divide the frames into 2 sets: move the second half over and renumber it.
    half = total // 2
    for index in range(half, total):
        source = os.path.join(first_dir, f"{index}.jpg")
        if not os.path.exists(source):
            # cv2.imwrite may fail without raising; nothing to move then.
            continue
        shutil.move(source, os.path.join(second_dir, f"{index - half}.jpg"))

    return [first_dir, second_dir]

In [3]:
# Tools made available to the agent; this list is later passed both to
# create_openai_functions_agent and to ToolExecutor.
tools = [get_video_description, split_video]

In [4]:
from langchain import hub
from langchain.agents import create_openai_functions_agent
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instructions for the agent.
# Adjacent string literals are concatenated verbatim, so every fragment ends
# with an explicit "\n"; without it headings and sentences run together
# (e.g. "Step 1: Divide the VideoYou are provided ...").
# Keep this text free of curly braces: it is passed to ChatPromptTemplate,
# where braces mark template variables such as "{input}".
system_prompt = (
    "Your task is to analyze a video and answer questions from the user. First you must divide it into two equal parts and then using a descriptive tool to analyze each part. Once both parts are analyzed, you need to aggregate the findings from both analyses into a comprehensive summary.\n"
    "\n"
    "Step 1: Divide the Video\n"
    "You are provided with a video file path.\n"
    "Use the split_video tool to divide this video into two equal parts. The tool is invoked with the command: split_video('example_video.mp4').\n"
    "The split_video tool will return the path of two folders, representing the first and second halves of the video, respectively.\n"
    "\n"
    "Step 2: Analyze Each Part\n"
    "Once the video is divided, use the get_video_description tool to analyze each part.\n"
    "Invoke the tool for the first part using the command: get_video_description('your question', 'folder_path_1').\n"
    "Repeat the process for the second part with the command: get_video_description('your question', 'folder_path_2').\n"
    "Each call to get_video_description will return a description of the respective video part.\n"
    "\n"
    "Step 3: Aggregate Findings\n"
    "After receiving the descriptions for both parts, your next task is to aggregate these findings.\n"
    "Combine the key points from each description to form a comprehensive summary and answer the user questions.\n"
    "Focus on highlighting any significant observations, thematic elements, or notable content from each part of the video.\n"
    "\n"
    "Step 4: Final Submission\n"
    "Compile your aggregated summary into a final report.\n"
    "Ensure that the report is clear and concise, providing a complete understanding of the entire video's content based on your analysis.\n"
    "When the report is ready, submit it with the title: “Comprehensive Analysis of example_video.mp4”.\n"
    "\n"
    "Remember: Accuracy and thoroughness are key in this task. Ensure that you capture all relevant details in your analysis and report.\n"
    "When finished, respond with FINISH."
)

# Prompt layout for the agent: system instructions, then any earlier
# conversation, then the current user request, then the agent scratchpad.
# The history placeholder sits before the user message so earlier turns are
# presented in chronological order; with an empty history the rendered prompt
# is the same as before.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# Choose the LLM that will drive the agent (created with streaming enabled).
# This is separate from the vision model instantiated inside
# get_video_description, which is the one that actually looks at the frames.
llm = ChatOpenAI(model="gpt-3.5-turbo-1106", streaming=True)

# Construct the OpenAI Functions agent from the LLM, the tool list and the
# prompt defined above; run_agent invokes this object on the graph state.
agent_runnable = create_openai_functions_agent(llm, tools, prompt)

In [5]:
from typing import TypedDict, Annotated, List, Union
from langchain_core.agents import AgentAction, AgentFinish
from langchain_core.messages import BaseMessage
import operator
import base64
class AgentState(TypedDict):
    """State dictionary shared by the graph nodes in this notebook.

    ``run_agent`` returns the ``agent_outcome`` key and ``execute_tools``
    returns the ``intermediate_steps`` key; ``input`` and ``chat_history`` are
    supplied by the caller when the graph is run.
    """
    # Text of the user's request (the run cells include the video path in it).
    input: str
    # Earlier conversation messages; the run cells pass an empty list.
    chat_history: list[BaseMessage]
    # Most recent agent decision: an action to execute, a final answer, or None.
    agent_outcome: Union[AgentAction, AgentFinish, None]
    # (action, stringified tool output) pairs produced by execute_tools.
    # Annotated with operator.add — presumably so the graph concatenates each
    # node's returned list onto the existing one rather than replacing it;
    # confirm against the langgraph documentation.
    intermediate_steps: Annotated[list[tuple[AgentAction, str]], operator.add]

In [6]:
from langgraph.prebuilt.tool_executor import ToolExecutor

# Executor built over the notebook's tool list; execute_tools invokes it with
# the action chosen by the agent.
tool_executor = ToolExecutor(tools)

def run_agent(data):
    """Graph node: ask the agent what to do next.

    Passes the current state ``data`` to ``agent_runnable.invoke`` and returns
    a one-key dict, ``{"agent_outcome": <result>}``, wrapping the result.
    """
    return {"agent_outcome": agent_runnable.invoke(data)}

def execute_tools(data):
    """Graph node: run the tool call the agent just requested.

    Reads the pending action from ``data['agent_outcome']``, hands it to
    ``tool_executor.invoke`` and returns
    ``{"intermediate_steps": [(action, str(tool_output))]}`` — a single-item
    list pairing the action with the stringified tool output.
    """
    action = data["agent_outcome"]
    result = tool_executor.invoke(action)
    step = (action, str(result))
    return {"intermediate_steps": [step]}

def should_continue(data):
    """Routing function: decide whether the graph stops or runs another tool.

    Returns ``"end"`` when ``data["agent_outcome"]`` is an ``AgentFinish``,
    otherwise ``"continue"``.
    """
    finished = isinstance(data["agent_outcome"], AgentFinish)
    return "end" if finished else "continue"

In [7]:
from langgraph.graph import END, StateGraph

# Build the agent loop as a graph over AgentState.
workflow = StateGraph(AgentState)

# Two nodes: "agent" asks the LLM for the next step, "action" runs the chosen tool.
workflow.add_node("agent", run_agent)
workflow.add_node("action", execute_tools)

# Execution starts at the agent node.
workflow.set_entry_point("agent")

# After the agent node, should_continue returns "continue" or "end";
# the mapping below sends "continue" to the action node and "end" to END.
workflow.add_conditional_edges(
    "agent",
    should_continue,
    {
        "continue": "action",
        "end": END,
    },
)

# After a tool has run, go back to the agent node.
workflow.add_edge("action", "agent")
# Compile the graph; the run cells call stream/invoke on `app`.
app = workflow.compile()

In [8]:
# Initial graph state: the user's request (which includes the video path) and
# an empty chat history.
inputs = {
    'input': 'Describe this video in details. Its path is ./hosptial.mp4',
    'chat_history': [],
}
# Stream the run chunk by chunk. `s` is intentionally left as a plain
# module-level loop variable because later cells read it after the loop.
for s in app.stream(inputs):
    # Each chunk is a dict; show its first value followed by a divider line.
    print(list(s.values())[0], "----", sep="\n")

{'agent_outcome': AgentActionMessageLog(tool='split_video', tool_input={'video_path': './hosptial.mp4'}, log="\nInvoking: `split_video` with `{'video_path': './hosptial.mp4'}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"video_path":"./hosptial.mp4"}', 'name': 'split_video'}})])}
----
success
{'intermediate_steps': [(AgentActionMessageLog(tool='split_video', tool_input={'video_path': './hosptial.mp4'}, log="\nInvoking: `split_video` with `{'video_path': './hosptial.mp4'}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"video_path":"./hosptial.mp4"}', 'name': 'split_video'}})]), "['analyst1_frames', 'analyst2_frames']")]}
----
{'agent_outcome': AgentActionMessageLog(tool='get_video_description', tool_input={'question': 'Please describe the first half of the video.', 'path': 'analyst1_frames'}, log="\nInvoking: `get_video_description` with `{'question': 'Please describe the first half of th

In [18]:
# Run the graph to completion and print the final state.
# NOTE(review): the recorded output for this cell shows a different question
# than the `inputs` defined in the streaming cell, so `inputs` was reassigned
# in a cell that is not visible here — re-check with Restart & Run All.
print(app.invoke(inputs))

success
{'input': 'What is label on the mask in the first part of this video. Its path is ./hosptial.mp4', 'chat_history': [], 'agent_outcome': AgentFinish(return_values={'output': 'In the first part of the video, the label on the mask reads "ARMY EMERGENCY RESPIRATOR."'}, log='In the first part of the video, the label on the mask reads "ARMY EMERGENCY RESPIRATOR."'), 'intermediate_steps': [(AgentActionMessageLog(tool='split_video', tool_input={'video_path': './hosptial.mp4'}, log="\nInvoking: `split_video` with `{'video_path': './hosptial.mp4'}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"video_path":"./hosptial.mp4"}', 'name': 'split_video'}})]), "['analyst1_frames', 'analyst2_frames']"), (AgentActionMessageLog(tool='get_video_description', tool_input={'question': 'What is the label on the mask in the first part of the video?', 'path': 'analyst1_frames'}, log="\nInvoking: `get_video_description` with `{'question': 'What is the label

In [19]:
# Display the current value of `inputs`.
# NOTE(review): the recorded value differs from the one assigned in the
# streaming cell, so it depends on out-of-order execution.
inputs

{'input': 'What is label on the mask in the first part of this video. Its path is ./hosptial.mp4',
 'chat_history': []}

In [10]:
# Display `s`, the loop variable left over from the streaming cell; after the
# loop it holds the last streamed chunk (the recorded output shows '__end__').
s

{'__end__': {'input': 'Describe this video in details. Its path is ./hosptial.mp4',
  'chat_history': [],
  'agent_outcome': AgentFinish(return_values={'output': 'The video "hospital.mp4" has been split into two parts for analysis. Here are the key findings from each part:\n\nFirst Half of the Video:\nThe first half of the video features two distinct scenes. The first scene depicts a person wearing an older military or medical-style oxygen mask labeled "ARMY EMERGENCY" on the rebreather bag. The person is shown from various angles, suggesting a focus on the mask and its use. Interspersed with these shots are images of a person slicing a freshly baked pizza on a wooden board. The pizza is being prepared in a food setting, and the person cutting it is wearing gloves. The alternating sequence of the oxygen mask and pizza suggests a thematic or narrative contrast, although the specific connection or storyline is not clear.\n\nSecond Half of the Video:\nThe second half of the video alternat

In [26]:
# Print only the final answer text from the last streamed chunk.
# Relies on `s` still being bound from the streaming loop.
print(s['__end__']['agent_outcome'].return_values['output'])

The video "hospital.mp4" has been split into two parts for analysis. Here are the key findings from each part:

First Half of the Video:
The first half of the video features two distinct scenes. The first scene depicts a person wearing an older military or medical-style oxygen mask labeled "ARMY EMERGENCY" on the rebreather bag. The person is shown from various angles, suggesting a focus on the mask and its use. Interspersed with these shots are images of a person slicing a freshly baked pizza on a wooden board. The pizza is being prepared in a food setting, and the person cutting it is wearing gloves. The alternating sequence of the oxygen mask and pizza suggests a thematic or narrative contrast, although the specific connection or storyline is not clear.

Second Half of the Video:
The second half of the video alternates between two settings. The first setting appears to be a laboratory or workshop environment from an older era, possibly mid-20th century, featuring scientific or techn