In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))
from tool_monkey import MonkeyObserver, create_tool_with_monkey, content_policy_violation, logger, setup_default_logging

In [None]:
# 10 is the numeric value of the standard library's logging.DEBUG; presumably
# setup_default_logging interprets `level` the same way - TODO confirm.
setup_default_logging(level=10)

# Content Moderation Examples

This notebook demonstrates how agents handle content moderation failures - a failure mode specific to AI-powered tools and moderated content platforms.

Content filters are common in:
- **Image generation APIs** (DALL-E, Midjourney, Stable Diffusion)
- **Text generation APIs** (OpenAI GPT, Anthropic Claude)
- **Content platforms** (social media, user-generated content sites)

Unlike timeouts or rate limits, content moderation failures require the agent to understand *why* the request was rejected and potentially rephrase or abandon the request.

## Example 1: Image Generation with Content Filter

**Use Case:** Agent generates images for a creative project. One prompt triggers the content policy filter.

**What Happens:**
1. User asks agent to generate fantasy character artwork
2. Agent creates prompts for different characters
3. **First 2 images:** Generated successfully ✅ ✅
4. **Third image:** Content policy violation! ❌ (triggered violence/weapon filter)
5. Agent sees error: "Content policy violation: violence"
6. Agent acknowledges the error and continues with other images

**What This Shows:**
- Content filters block specific requests, not all requests
- Agent must handle rejection gracefully (can't just retry - same prompt will fail again)
- Real pattern: DALL-E, Midjourney, and Stable Diffusion all have content filters
- Different from technical failures - this is a *policy* issue, not a transient error

**Scenario Configuration:**
- `content_policy_violation(reason="violence")` - triggers on 3rd call
- Simulates: Content safety filter detecting prohibited content

In [None]:
def content_moderation_example():
    """Run an image-generation agent loop against a simulated content filter.

    Wraps ``base_image_gen_tool`` with a
    ``content_policy_violation(reason="violence")`` scenario, asks the LLM for
    five fantasy character images, and feeds every tool result - or the text of
    the raised error - back into the conversation so the model can react to it.
    Progress is printed per tool call, followed by ``observer.summary()``.

    The loop stops when the model replies without tool calls, or after
    ``max_iterations`` model turns, whichever comes first.
    """
    from langchain_examples.shared.llm import llm
    from langchain_examples.shared.tools import base_image_gen_tool, ImageGenInput

    observer = MonkeyObserver()
    # The notebook text says the filter fires on the 3rd call; no call index is
    # passed here, so it presumably comes from the scenario's default - TODO confirm.
    scenario = content_policy_violation(reason="violence")

    generate_image = create_tool_with_monkey(
        base_image_gen_tool,
        scenario,
        observer,
        tool_name="generate_image",
        args_schema=ImageGenInput
    )

    system_prompt = """You are a creative assistant helping generate fantasy character artwork.
When asked to create multiple images, call the generate_image tool for each character."""

    user_prompt = """Generate 5 fantasy character images in a dramatic style:
1. A wise old wizard with a staff
2. A brave knight in shining armor  
3. A dark warrior wielding a flaming sword
4. A mystical elf archer in a forest
5. A dragon perched on a mountain"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    llm_with_tools = llm.bind_tools([generate_image])

    max_iterations = 10
    for _ in range(max_iterations):
        ai_msg = llm_with_tools.invoke(messages)
        messages.append(ai_msg)

        if not ai_msg.tool_calls:
            print(f"Agent finished: {ai_msg.content}")
            break

        for tool_call in ai_msg.tool_calls:
            logger.debug(f"Invoking tool: {tool_call.get('name')} with args {tool_call.get('args')}")
            # Only the tool invocation is guarded: an error while reporting
            # success must not be mistaken for a content-filter rejection, nor
            # add a second tool message for the same tool_call_id.
            try:
                result = generate_image.invoke(tool_call)
            except Exception as e:
                print(f"❌ Content filter triggered: {e}")
                # Report the failure to the model as the tool's reply so the
                # conversation stays well-formed and the agent can adapt.
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call["id"],
                    "content": f"Error: {e}"
                })
            else:
                messages.append(result)
                # Tolerate tool calls without a "prompt" argument.
                prompt = str((tool_call.get("args") or {}).get("prompt", ""))
                print(f"✅ Generated image: {prompt[:50]}...")
    else:
        # Reached only when the loop was never broken out of.
        print(f"⚠️ Agent did not finish within {max_iterations} iterations")

    print("\n" + "=" * 50)
    print("OBSERVER METRICS:")
    print("=" * 50)
    print(observer.summary())

# Run Example 1 when this cell executes.
content_moderation_example()

## Example 2: Multiple Content Filters

**Use Case:** Social media content creation agent encounters multiple content policy issues.

**What Happens:**
1. Agent generates social media images for a marketing campaign
2. First image: Triggers NSFW filter ❌
3. Second image: Generates successfully ✅
4. Third image: Triggers hate speech filter ❌
5. Fourth image: Generates successfully ✅
6. Agent must acknowledge which content policies were violated

**What This Shows:**
- Different types of content violations (NSFW, hate speech, violence, etc.)
- Agent sees different error reasons in the response
- Testing how agents handle and report policy violations to users

**Scenario Configuration:**
- Two `ToolFailure` entries with different violation reasons: call 1 (NSFW) and call 3 (hate speech)
- Simulates: Real content moderation systems with multiple filter types

In [None]:
def multi_filter_example():
    """Run an image-generation agent loop that hits two different content filters.

    Builds a ``FailureScenario`` with two ``content_moderation`` failures - an
    NSFW reason on call 1 and a hate-speech reason on call 3 - wraps
    ``base_image_gen_tool`` with it, and asks the LLM for four marketing images.
    Every tool result, or the text of the raised error, is fed back into the
    conversation. Progress is printed per tool call, followed by
    ``observer.summary()``.

    The loop stops when the model replies without tool calls, or after
    ``max_iterations`` model turns, whichever comes first.
    """
    from langchain_examples.shared.llm import llm
    from langchain_examples.shared.tools import base_image_gen_tool, ImageGenInput
    from tool_monkey import FailureScenario, ToolFailure

    observer = MonkeyObserver()

    # Create scenario with multiple content moderation failures
    scenario = FailureScenario(
        name="multi_content_filter",
        failures=[
            ToolFailure(
                error_type="content_moderation",
                on_call_count=1,
                config={
                    "content_moderation": {
                        "content_categories": {"nsfw_content": True},
                        "reason": "NSFW content detected"
                    }
                }
            ),
            ToolFailure(
                error_type="content_moderation",
                on_call_count=3,
                config={
                    "content_moderation": {
                        "content_categories": {"hate_speech": True},
                        "reason": "Hate speech detected"
                    }
                }
            ),
        ]
    )

    generate_image = create_tool_with_monkey(
        base_image_gen_tool,
        scenario,
        observer,
        tool_name="generate_image",
        args_schema=ImageGenInput
    )

    system_prompt = """You are a social media content creator.
Generate images for marketing posts using the generate_image tool."""

    user_prompt = """Create 4 marketing images for our new product launch.
Use a modern, professional style."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    llm_with_tools = llm.bind_tools([generate_image])

    max_iterations = 8
    for _ in range(max_iterations):
        ai_msg = llm_with_tools.invoke(messages)
        messages.append(ai_msg)

        if not ai_msg.tool_calls:
            print(f"Agent finished: {ai_msg.content}")
            break

        for tool_call in ai_msg.tool_calls:
            logger.debug(f"Invoking tool: {tool_call.get('name')} with args {tool_call.get('args')}")
            # Only the tool invocation is guarded: an error while reporting
            # success must not be mistaken for a filter rejection, nor add a
            # second tool message for the same tool_call_id.
            try:
                result = generate_image.invoke(tool_call)
            except Exception as e:
                print(f"❌ Filter triggered: {e}")
                # Report the failure to the model as the tool's reply so the
                # conversation stays well-formed and the agent can adapt.
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call["id"],
                    "content": f"Content policy violation: {e}"
                })
            else:
                messages.append(result)
                # Tolerate tool calls without a "prompt" argument.
                prompt = str((tool_call.get("args") or {}).get("prompt", ""))
                print(f"✅ Generated: {prompt[:40]}...")
    else:
        # Reached only when the loop was never broken out of.
        print(f"⚠️ Agent did not finish within {max_iterations} iterations")

    print("\n" + "=" * 50)
    print("OBSERVER METRICS:")
    print("=" * 50)
    print(observer.summary())

# Run Example 2 when this cell executes.
multi_filter_example()