# 🎯 Multi-Agent Multimodal Analysis with FAISS Memory


In [None]:
!pip install mem0ai
!pip install opensearch-py
!pip install faiss-cpu

In [None]:
import boto3
import os
import json
from datetime import datetime

from strands.models import BedrockModel
from strands import Agent
from strands_tools import image_reader, file_read, mem0_memory,use_llm
from video_reader import video_reader
from strands.tools import tool


## 🤖 Agent Configuration with FAISS Memory


In [None]:
# Identifier the agent must attach to every mem0_memory tool call.
# It is defined before the prompt so the f-string below embeds the real value;
# the demo cells further down use the same identifier.
USER_ID = "eli_abc"

# System prompt with memory capabilities instruction.
# This is an f-string: {USER_ID} is substituted when the cell runs, so the model
# sees the actual user id instead of the literal text "{USER_ID}".
MULTIMODAL_SYSTEM_PROMPT = f""" You are a helpful assistant that can process documents, images, and videos. 
Analyze their contents and provide relevant information. You have memory capabilities and can remember previous interactions.

You can:

1. For PNG, JPEG/JPG, GIF, or WebP formats use image_reader to process file
2. For PDF, csv, docx, xls or xlsx formats use file_read to process file  
3. For MP4, MOV, AVI, MKV, WebM formats use video_reader to process file
4. Just deliver the answer

memory capabilities:
- Store new information using mem0_memory tool (action="store")
- Retrieve relevant memories (action="retrieve")
- List all memories (action="list")
- Provide personalized responses

Key Rules:
- Always include user_id={USER_ID} in tool calls
- Be conversational and natural in responses
- Format output clearly
- Acknowledge stored information
- Only share relevant information
- Politely indicate when information is unavailable


When displaying responses:
- Format answers data in a human-readable way
- Highlight important information
- Handle errors appropriately
- Convert technical terms to user-friendly language
- Always reply in the original user language
- Reference relevant past interactions when appropriate

Always reply in the original user language.
"""

# AWS session pinned to the region used for the Bedrock calls
session = boto3.Session(region_name="us-west-2")

# Bedrock-hosted model used for inference, with streaming turned off
# (alternative model id: "us.amazon.nova-pro-v1:0")
bedrock_model = BedrockModel(
    boto_session=session,
    model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    streaming=False,
)

# Agent wired with the system prompt, the Bedrock model, and the
# image/file/video readers plus the memory and LLM tools
multimodal_agent = Agent(
    model=bedrock_model,
    system_prompt=MULTIMODAL_SYSTEM_PROMPT,
    tools=[image_reader, file_read, video_reader, mem0_memory, use_llm],
)

## 🎯 Initialize some demo memories to showcase functionality

In [None]:
# Fixed user id under which the demo memories are stored
USER_ID = "eli_abc"

# Self-introduction that seeds the memory store for this user
content = (
    "Hello, my name is Elizabeth, but they call me Eli. "
    "I'm a developer advocate at AWS, and I want to understand what's in "
    "images, videos, and documents to improve my day-to-day work."
)

# Call the memory tool directly (no model round-trip) to persist the text
multimodal_agent.tool.mem0_memory(
    action="store",
    content=content,
    user_id=USER_ID,
)


## 🎯 Usage Examples with Memory



In [None]:
# Example 1: Image analysis with memory storage
print("=== 📸 IMAGE ANALYSIS WITH MEMORY ===")
# The user id is interpolated into the request so the agent can pass it to the
# memory tool; previously the literal word "USER_ID" was sent instead.
image_result = multimodal_agent(
    "Analyze the image data-sample/diagram.jpg in detail and describe everything you observe. "
    f"Remember this information for later. USER_ID: {USER_ID}"
)
print(image_result)
print("\n" + "=" * 80 + "\n")

In [None]:
# Display the text of the first content block of the image result's message
image_result.message["content"][0]["text"]

In [None]:
# Query the memory tool directly for memories matching a question about the image
retrieved_memories = multimodal_agent.tool.mem0_memory(
    action="retrieve",
    query="What services are in the image?",
    user_id=USER_ID,
)
print("Retrieved Memories:", retrieved_memories)

In [None]:

# List every memory stored for this user and show the tool's response.
# The result is printed after the call (mirroring the retrieve cell) so the
# header is followed by the listing instead of being left without output.
all_memories = multimodal_agent.tool.mem0_memory(
    action="list", user_id=USER_ID
)
print("All Stored Memories:", all_memories)

In [None]:
# Print the text of the first content block of the image result's message
print(image_result.message["content"][0]["text"])

In [None]:
# Example 2: Video analysis with memory storage
# (banner emoji restored from its mis-decoded form)
print("=== 🎬 VIDEO ANALYSIS WITH MEMORY ===")
video_result = multimodal_agent(
    "Analyze the video data-sample/moderation-video.mp4 and describe in detail "
    "the actions and scenes you observe. Store this information in your memory."
)
print(video_result)
print("\n" + "=" * 80 + "\n")

In [None]:
# Display the text of the first content block of the video result's message
video_result.message["content"][0]["text"]

In [None]:
# Example 3: Document analysis with memory storage
# (banner emoji restored from its mis-decoded form)
print("=== 📄 DOCUMENT ANALYSIS WITH MEMORY ===")
doc_result = multimodal_agent(
    "Summarize as json the content of the document "
    "data-sample/Welcome-Strands-Agents-SDK.pdf and store this information in your memory."
)
print(doc_result)


In [None]:
# Display the text of the first content block of the document result's message
doc_result.message["content"][0]["text"]

In [None]:
# Example 4: Testing memory recall across multiple media types
# (banner emoji restored from its mis-decoded form)
print("=== 🧠 MEMORY RECALL TEST ===")
memory_result = multimodal_agent(
    "What do you remember about the image, video, and document I showed you earlier?"
)
print(memory_result)

In [None]:
# Display the text of the first content block of the recall result's message
memory_result.message["content"][0]["text"]