In [None]:
import os, getpass
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())
from IPython.display import Image, display
from langgraph.graph import START, END, StateGraph
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from typing_extensions import TypedDict
from pydantic import BaseModel, Field
from langgraph.graph import MessagesState
# NOTE(review): "AnyMessageb" looks like a typo for AnyMessage, and add_messages is
# normally imported from langgraph.graph.message — confirm and correct this line.
from langchain_core.messages import AnyMessageb , add_messages


In [None]:
# Shared chat model instance for the notebook; temperature=0 asks for the least random sampling.
# NOTE(review): confirm that the "gpt-5" model accepts a non-default temperature value.
llm = ChatOpenAI(model="gpt-5", temperature=0)

### Tools

In [None]:
class RoomState(BaseModel):
    """Represents the latest state of a room from MongoDB.

    One instance holds the text descriptions, media paths and timestamp of the
    most recent event recorded for a single room.
    """
    # Numeric identifier of the room.
    room_number: int
    # Human-readable room name.
    room_name: str
    # Text description of what the room's video shows.
    video_description: str
    # Text transcript of the room's audio.
    audio_transcript: str
    # Locations of the associated media files.
    # NOTE(review): presumably filesystem paths — confirm against the writer of these documents.
    screenshot_path: str
    video_path: str
    audio_path: str
    # Time of the event this state was built from.
    timestamp: datetime


@dataclass
class SearchResult:
    """Result of an object search operation.

    Only `found` is required; every other field defaults to None.
    """
    # True when the searched object was located.
    found: bool
    # Room in which the object was located (number and display name).
    room_number: Optional[int] = None
    room_name: Optional[str] = None
    # Free-text description accompanying the result.
    description: Optional[str] = None
    # NOTE(review): presumably the path of an image with the object highlighted — confirm with the producer.
    highlighted_image_path: Optional[str] = None

In [None]:
async def get_latest_room_states(
    mongo_connection_string: Optional[str] = None
) -> List[RoomState]:
    """
    Connects to MongoDB and retrieves the most recent document for each room.

    Reads the `events` collection of the `dementia_assistance` database and
    keeps, per `room_number`, the document with the greatest `time` value.

    Args:
        mongo_connection_string: Optional MongoDB URI. Defaults to the
            MONGODB_URI environment variable, or localhost when it is unset.

    Returns:
        List of RoomState objects, one per room with the latest data. Missing
        text/path fields fall back to "", a missing room number to 0 and a
        missing time to the current local time.
    """
    if mongo_connection_string is None:
        mongo_connection_string = os.getenv("MONGODB_URI", "mongodb://localhost:27017")

    client = AsyncIOMotorClient(mongo_connection_string)
    try:
        collection = client.dementia_assistance.events

        # Aggregation pipeline: newest first, then keep the first (newest)
        # document of each room and promote it back to the top level.
        pipeline = [
            {"$sort": {"time": -1}},
            {"$group": {
                "_id": "$room_number",
                "latest_doc": {"$first": "$$ROOT"}
            }},
            {"$replaceRoot": {"newRoot": "$latest_doc"}}
        ]

        room_states = []
        async for doc in collection.aggregate(pipeline):
            room_num = doc.get("room_number", 0)
            room_states.append(RoomState(
                room_number=room_num,
                room_name=ROOMS.get(room_num, f"Room {room_num}"),
                video_description=doc.get("video_description", ""),
                audio_transcript=doc.get("audio_transcript", ""),
                screenshot_path=doc.get("screenshot_path", ""),
                video_path=doc.get("video_path", ""),
                audio_path=doc.get("audio_path", ""),
                timestamp=doc.get("time", datetime.now())
            ))
        return room_states
    finally:
        # Release the connection even when the query or model validation fails.
        client.close()



async def get_specific_room_object():
    """
    Placeholder for looking up a specific object in a specific, known room.

    Intended for the case where the user already knows which room to search.
    Not implemented yet: awaiting it simply yields None.
    """
    return None


async def search_object_in_rooms_descriptions(
    object_name: str,
    room_states: List[RoomState]
) -> Optional[RoomState]:
    """
    Ask the chat model which room's latest documents mention an object.

    All room documents are concatenated into a single prompt so that one model
    call covers every room.

    Args:
        object_name: The object to search for.
        room_states: RoomState objects to search through.

    Returns:
        The RoomState whose room number the model answered with, or None when
        room_states is empty, the model answers "None", or the answer is not
        the number of a room present in room_states.
    """
    if not room_states:
        # Nothing to search; skip the model call entirely.
        return None

    # Each room contributes its room number, room name, video description
    # and audio transcript to the prompt.
    concat_room_documents = ""
    for room in room_states:
        concat_room_documents += f"""
Room Number: {room.room_number}
Room Name: {room.room_name}
Video Description: {room.video_description}
Audio Transcript: {room.audio_transcript}
---
"""
    prompt = f"""
Analyse the following room documents, searching for the object "{object_name}" in the video description and audio transcript. 
If the object is mentioned and sounds like it is currently in the room (not removed), return ONLY the room number as an integer.
If the object is not found or has been removed, return "None".

Room Documents:
{concat_room_documents}

Return format: Just the room number (e.g., "5") or "None"
    """

    response = await llm.ainvoke(prompt)

    # The prompt asks for a bare integer or "None"; anything that does not
    # parse as an integer is treated as "not found".
    answer = str(response.content).strip().strip('"')
    try:
        room_number = int(answer)
    except ValueError:
        return None

    return next(
        (room for room in room_states if room.room_number == room_number),
        None,
    )






    


async def search_object_in_descriptions(
    object_name: str,
    room_states: List[RoomState]
) -> Optional[RoomState]:
    """
    Uses Gemini to analyze if the object is mentioned in any video description.

    Rooms are checked one at a time, in the order given; rooms with an empty
    video description are skipped and the first room answered with "YES" wins.

    Args:
        object_name: The object to search for (e.g., "xbox controller")
        room_states: List of RoomState objects to search through

    Returns:
        RoomState where object was found, or None if not found in any description.
    """
    import asyncio

    for room in room_states:
        if not room.video_description:
            continue
            
        prompt = f"""Analyze this video description to determine if it mentions or describes a "{object_name}".
        
Video Description:
{room.video_description}

Answer with ONLY "YES" or "NO" followed by a brief explanation if YES.
Example: "YES - The description mentions the user placed an xbox controller on the coffee table."
"""

        # generate_content is called synchronously; run it in a worker thread
        # so this coroutine does not block the event loop while waiting.
        response = await asyncio.to_thread(
            gemini_client.models.generate_content,
            model="gemini-2.5-flash",
            contents=[prompt],
        )

        # Guard against a response without a text payload instead of crashing.
        answer = (response.text or "").strip().upper()
        if answer.startswith("YES"):
            print(f"[ImageSearch] Found '{object_name}' in {room.room_name} description")
            return room

    print(f"[ImageSearch] '{object_name}' not found in any video descriptions")
    return None


async def search_object_in_image(
    object_name: str,
    image_path: str
) -> tuple[bool, str]:
    """
    Uses the gpt-4o vision model to visually inspect an image for the specified object.

    The image is sent inline as a base64 data URL together with a prompt that
    asks for an answer starting with "YES" or "NO".

    Args:
        object_name: The object to search for
        image_path: Path to the screenshot image

    Returns:
        Tuple of (found: bool, description: str describing location or reason not found).
        `found` is True only when the model's answer starts with "YES"
        (case-insensitive); an empty answer yields (False, "").

    Raises:
        OSError: If the image file cannot be opened or read.
    """
    import asyncio
    import base64

    # Read and encode image
    with open(image_path, "rb") as img_file:
        image_data = base64.b64encode(img_file.read()).decode("utf-8")

    # Determine image type from extension; unknown extensions keep the
    # previous "image/png" default.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    ext = os.path.splitext(image_path)[1].lower()
    media_type = media_types.get(ext, "image/png")

    # The completion call is synchronous; run it in a worker thread so this
    # coroutine does not block the event loop while waiting.
    response = await asyncio.to_thread(
        openai_client.chat.completions.create,
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""You are helping a dementia patient find their belongings.
                        
Look carefully at this image and determine if you can see a "{object_name}".

If you CAN see it:
- Answer starting with "YES"
- Describe its exact location in the image (e.g., "on the desk near the window", "on the floor next to the couch")

If you CANNOT see it:
- Answer starting with "NO"
- Briefly explain why (e.g., "The object is not visible in this image")
"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_data}"
                        }
                    }
                ]
            }
        ],
        max_tokens=300
    )

    # Guard against a message without text content instead of crashing.
    answer = (response.choices[0].message.content or "").strip()
    found = answer.upper().startswith("YES")

    return found, answer




In [1]:
import base64
from openai import OpenAI

# OpenAI SDK client used by this cell; built with no explicit arguments, so the
# SDK's default configuration applies (by default it reads OPENAI_API_KEY from the environment).
client = OpenAI()

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at `image_path` as a Base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


# Path to your image
# NOTE(review): absolute, machine-specific path; the "Â·" also looks like a
# mis-decoded "·" — confirm the real file name and prefer a configurable path.
image_path = r"C:\Users\amogh\OneDrive\Pictures\DALLÂ·E 2022-10-05 13.04.31 - Among us in real life.png"

# Getting the Base64 string
base64_image = encode_image(image_path)

# Declare the media type that matches the file extension; the data URL
# previously claimed image/jpeg even though the file is a .png.
image_media_type = "image/png" if image_path.lower().endswith(".png") else "image/jpeg"

# Send one user message holding a text question plus the inline image.
response = client.responses.create(
    model="gpt-4.1",
    input=[
        {
            "role": "user",
            "content": [
                { "type": "input_text", "text": "what's in this image?" },
                {
                    "type": "input_image",
                    "image_url": f"data:{image_media_type};base64,{base64_image}",
                },
            ],
        }
    ],
)

print(response.output_text)

This image shows two people standing close to each other, seemingly in a playful or affectionate moment. The woman is smiling and holding the man's chin, while the man has his eyes closed and face scrunched up, possibly pretending to be annoyed or resisting but in a humorous, light-hearted way. They appear to be indoors, possibly in an elevator, given the metallic walls and lighting. The overall mood seems friendly and playful.


In [4]:
from agents import Agent, ModelSettings, function_tool, Runner

@function_tool
def node_a() -> str:
    """This node returns a string which the other nodes will use"""
    # The docstring above is left untouched on purpose: function_tool exposes
    # it to the model as the tool description.
    print("Node A")
    message = "I am Node A"
    return message

@function_tool
def node_b(prev_node: str):
    """Prints the previous node's output and sends its own to the next one"""
    # The docstring above is left untouched on purpose: function_tool exposes
    # it to the model as the tool description.
    print(prev_node)
    message = "I am harambe"
    return message

@function_tool
def node_c(prev_node: str):
    """Prints the previous node's output; this is the last node and returns nothing."""
    # The docstring doubles as the tool description shown to the model, so it
    # must describe what this tool does (it previously described a weather lookup).
    print(prev_node)


# Agent wired with the three node tools; the instructions ask the model to call them in a fixed order.
# NOTE(review): the name "Haiku agent" does not describe this tool-chaining demo — consider renaming.
agent = Agent(
    name="Haiku agent",
    instructions="Execute the following tools in the following order: node_a, node_b, node_c",
    model="gpt-5-nano",
    tools=[node_a, node_b, node_c],
)

# Top-level await: valid in a notebook cell (IPython runs the coroutine for you), not in a plain script.
result = await Runner.run(agent, "run")

Node A
I am Node A
I am harambe
